/// <summary>
/// Console demo: runs a fixed sample text through XmlHtmlHandler,
/// LeadingPuncSplitter, LeadingDigitSplitter and InformalExpHandler
/// (flattening after each splitter) and prints the text before/after
/// plus the history string of every resulting token.
/// </summary>
/// <param name="informalExpMap"> map handed to InformalExpHandler.Process </param>
private static void TestProcess(Dictionary<string, string> informalExpMap)
{
    Console.WriteLine("----- Test Process Text: -----");
    string inText = "u rolling &amp; pls(12years).";

    // Build the pipeline stage by stage; every stage is lazy, so nothing
    // runs until the final ToList() and tokens flow through one at a time.
    List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);
    var xmlHtmlHandled = inTokenList.Select(t => XmlHtmlHandler.Process(t));
    var puncSplit = xmlHtmlHandled
        .Select(t => LeadingPuncSplitter.Process(t))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));
    var digitSplit = puncSplit
        .Select(t => LeadingDigitSplitter.Process(t))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));
    List<TokenObj> outTokenList = digitSplit
        .Select(t => InformalExpHandler.Process(t, informalExpMap))
        .ToList();

    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("--------- LeadingDigitSplitter( ) Test ----------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("----- Details -----------");
    for (int i = 0; i < outTokenList.Count; i++)
    {
        Console.WriteLine(i + "|" + outTokenList[i].ToHistString());
    }
}
// private methods

/// <summary>
/// Console demo of ProcNdCorrector.Process on a fixed sample sentence,
/// using the split limit and informal-expression map read from a CSpellApi
/// built from the given configuration file.
/// </summary>
/// <param name="configFile"> path passed to the CSpellApi constructor </param>
private static void Test(string configFile)
{
    const bool debugFlag = false;

    Console.WriteLine("----- Test Pre-Correction Text: -----");
    string inText = "We cant theredve hell.Plz u r good123. ";

    // Settings come from the API instance
    CSpellApi cSpellApi = new CSpellApi(configFile);
    int ndMaxSplitNo = cSpellApi.GetCanNdMaxSplitNo();
    Dictionary<string, string> infExpMap = cSpellApi.GetInformalExpressionMap();

    // Tokenize, correct, and convert back to text
    List<TokenObj> outTokenList = ProcNdCorrector.Process(
        TextObj.TextToTokenList(inText), ndMaxSplitNo, infExpMap, debugFlag);
    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("--------------------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("----- Details -----------");
    for (int i = 0; i < outTokenList.Count; i++)
    {
        Console.WriteLine(i + "|" + outTokenList[i].ToHistString());
    }
}
// public methods
// private methods

/// <summary>
/// Console demo: builds a hand-written MergeObj describing the merge of
/// "dia gnosed" into "diagnosed." and prints the value GetNonMergeTerm
/// returns for it against the non-space tokens of a sample sentence.
/// </summary>
private static void Test()
{
    // Positions in the full token list (spaces included)
    const int startIndex = 4; // start index of merge
    const int tarIndex = 6;   // target index
    const int endIndex = 6;   // end index of merge

    // Positions in the non-space token list
    const int startPos = 2;   // start pos of merge
    const int tarPos = 3;     // target pos
    const int endPos = 3;     // end pos of merge

    const int mergeNo = 1;    // total no of merged tokens

    const string tarWord = "gnosed";          // target term
    const string orgMergeWord = "dia gnosed"; // original words before the merge
    const string mergeWord = "diagnosed.";    // suggested merged term
    const string coreMergeWord = "diagnosed"; // core of the suggested merged term

    MergeObj mergeObj = new MergeObj(
        tarWord, orgMergeWord, mergeWord, coreMergeWord, mergeNo,
        startIndex, tarIndex, endIndex,
        startPos, tarPos, endPos);

    string inText = "He is dia gnosed last week.";
    List<TokenObj> nonSpaceTextList =
        TextObj.GetNonSpaceTokenObjList(TextObj.TextToTokenList(inText));

    Console.WriteLine("------ Merge Obj -------");
    Console.WriteLine(mergeObj.ToString());

    Console.WriteLine("------ Non Merge Term -------");
    string nonMergeTerm = GetNonMergeTerm(mergeObj, nonSpaceTextList);
    Console.WriteLine($"- inText: [{inText}]");
    Console.WriteLine($"- nonMergeTerm: [{nonMergeTerm}]");
}
/// <summary>
/// Convenience overload: same as the four-argument Process with debug
/// printing turned off.
/// </summary>
/// <param name="inText"> input text </param>
/// <param name="maxSpRecursiveNo"> forwarded unchanged to the four-argument overload </param>
/// <param name="infExpMap"> forwarded unchanged to the four-argument overload </param>
/// <returns> the token list produced by the four-argument overload </returns>
public static List<TokenObj> Process(string inText, int maxSpRecursiveNo,
    Dictionary<string, string> infExpMap)
{
    // The delegate overload tokenizes inText itself, so no token list is
    // built here (the previous local was computed and then discarded).
    const bool debugFlag = false;
    return Process(inText, maxSpRecursiveNo, infExpMap, debugFlag);
}
/// <summary>
/// Console demo: runs a fixed sample text through XmlHtmlHandler and then
/// LeadingPuncSplitter (with a recursion limit of 5) and prints the text
/// before/after plus the history string of every resulting token.
/// </summary>
private static void TestProcessText()
{
    Console.WriteLine("----- Test Process Text: -----");

    // The embedded double quote (5'8") must be escaped; unescaped it ends
    // the literal early and the method does not compile.
    string inText = "Head rolling &amp; rock(5'8\").";
    int maxRecursive = 5;

    List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);
    List<TokenObj> outTokenList = inTokenList
        .Select(token => XmlHtmlHandler.Process(token))
        .Select(token => LeadingPuncSplitter.Process(token, maxRecursive))
        .ToList();

    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("--------- LeadingPuncSplitter( ) Test -----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
    Console.WriteLine("----- Details -----------");
    int index = 0;
    foreach (TokenObj tokenObj in outTokenList)
    {
        Console.WriteLine(index + "|" + tokenObj.ToHistString());
        index++;
    }
}
/// <summary>
/// The core of the pre-correction API: tokenizes the input text and hands
/// the tokens to ProcNdCorrector.Process.
/// </summary>
/// <param name="inText"> input text </param>
/// <param name="maxSpRecursiveNo"> forwarded unchanged to ProcNdCorrector.Process </param>
/// <param name="infExpMap"> forwarded unchanged to ProcNdCorrector.Process </param>
/// <param name="debugFlag"> forwarded unchanged to ProcNdCorrector.Process </param>
/// <returns> the token list returned by ProcNdCorrector.Process </returns>
public static List<TokenObj> Process(string inText, int maxSpRecursiveNo,
    Dictionary<string, string> infExpMap, bool debugFlag)
{
    return ProcNdCorrector.Process(
        TextObj.TextToTokenList(inText), maxSpRecursiveNo, infExpMap, debugFlag);
}
/// <summary>
/// Runs the cSpell correction process on a text and returns the result as
/// a list of TokenObj. Prints a header containing this instance's funcMode
/// and rankMode through DebugPrint, then delegates to
/// CorrectionApi.ProcessByTokenObj.
/// </summary>
/// <param name="inText"> input text to be corrected </param>
/// <param name="debugFlag"> boolean flag for debug print </param>
/// <returns> the list of TokenObj returned by CorrectionApi.ProcessByTokenObj </returns>
public virtual List<TokenObj> ProcessToTokenObj(string inText, bool debugFlag)
{
    string header = "====== SpellApi.Process( ), funcMode: " + funcMode_
        + ", rankMode: " + rankMode_ + " ======";
    DebugPrint.Println(header, debugFlag);

    // non-dictionary and dictionary based correction
    List<TokenObj> tokens = TextObj.TextToTokenList(inText);
    return CorrectionApi.ProcessByTokenObj(tokens, this, debugFlag);
}
/// <summary>
/// Tokenizes the input text, runs ProcessByTokenObj on the tokens, and
/// returns both the resulting text and the resulting token list.
/// </summary>
/// <param name="inText"> input text </param>
/// <param name="cSpellApi"> forwarded unchanged to ProcessByTokenObj </param>
/// <param name="debugFlag"> forwarded unchanged to ProcessByTokenObj </param>
/// <returns> tuple of (output text, output token list) </returns>
public static (string, List<TokenObj>) ProcessToStrExt(string inText,
    CSpellApi cSpellApi, bool debugFlag)
{
    List<TokenObj> corrected = ProcessByTokenObj(
        TextObj.TextToTokenList(inText), cSpellApi, debugFlag);

    // Hand back both views of the same result
    return (TextObj.TokenListToText(corrected), corrected);
}
/// <summary>
/// Applies Process(string) once to the input and then, while the text is
/// still changing and passes remain, re-tokenizes the latest text, applies
/// the per-token Process to every token, and joins the tokens back to text.
/// </summary>
/// <param name="inWord"> input text </param>
/// <param name="maxProcess"> maximum number of additional passes after the first </param>
/// <returns> the text after the last pass </returns>
public static string Process(string inWord, int maxProcess)
{
    string previous = inWord;
    string current = Process(inWord);

    // Stop when the pass budget is used up or a pass changes nothing
    for (int remaining = maxProcess;
         remaining > 0 && !current.Equals(previous);
         remaining--)
    {
        previous = current;

        // Go through token objects so each token is processed on its own
        List<TokenObj> tokens = TextObj.TextToTokenList(previous);
        List<TokenObj> processed = tokens.Select(t => Process(t)).ToList();
        current = TextObj.TokenListToText(processed);
    }

    return current;
}
/// <summary>
/// Console demo: calls NonWordMergeCorrector.GetCorrectTerm for position 0
/// of the non-space tokens of a fixed sample sentence and prints the
/// input, the non-space token count, and the returned MergeObj.
/// </summary>
/// <param name="cSpellApi"> forwarded unchanged to GetCorrectTerm </param>
private static void TestGetCorrectTerm(CSpellApi cSpellApi)
{
    const int tarPos = 0;
    const bool debugFlag = false;
    string inText = "Dur ing my absent.";

    // Work on the token list with space tokens removed
    List<TokenObj> nonSpaceTokenList =
        TextObj.GetNonSpaceTokenObjList(TextObj.TextToTokenList(inText));

    MergeObj mergeObj = NonWordMergeCorrector.GetCorrectTerm(
        tarPos, nonSpaceTokenList, cSpellApi, debugFlag);

    // Report
    Console.WriteLine("--------- GetCorrectTerm( ) -----------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"In nonSpaceTokenList: [{nonSpaceTokenList.Count}]");
    Console.WriteLine("Out MergeObj: [" + mergeObj.ToString() + "]");
}
/// <summary>
/// Console demo: runs Process on the tokens of a fixed all-lowercase
/// sample sentence and prints the text before/after followed by the
/// operation-detail string of the output tokens.
/// </summary>
/// <param name="cSpellApi"> forwarded unchanged to Process </param>
private static void TestProcess(CSpellApi cSpellApi)
{
    const bool debugFlag = false;
    string inText = "hotflashes and knowaboutare not forr playsure.";

    // Tokenize, process, and join back to text
    List<TokenObj> outTokenList = Process(
        TextObj.TextToTokenList(inText), cSpellApi, debugFlag);
    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("----- Details -----------");
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(outTokenList));
}
/// <summary>
/// Console demo: runs Process on the tokens of a fixed multi-sentence
/// sample containing split words ("dur ing", "dis appoint ment") and
/// prints the text before/after followed by the operation-detail string
/// of the output tokens.
/// </summary>
/// <param name="cSpellApi"> forwarded unchanged to Process </param>
private static void TestProcess(CSpellApi cSpellApi)
{
    const bool debugFlag = false;
    string inText = "She had problems dur ing her pregnancies. That is a dis appoint ment. Good!";

    // Process works on a token list, not on raw text
    List<TokenObj> tokens = TextObj.TextToTokenList(inText);
    List<TokenObj> outTokenList = Process(tokens, cSpellApi, debugFlag);
    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("----- Details -----------");
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(outTokenList));
}
// private method

/// <summary>
/// Console demo: runs ProcessByTokenObj on the tokens of a fixed
/// all-lowercase sample sentence (written to exercise non-word,
/// one-to-one, split, and merge correction) and prints the text
/// before/after followed by the operation-detail string of the output.
/// </summary>
/// <param name="cSpellApi"> forwarded unchanged to ProcessByTokenObj </param>
private static void TestProcess(CSpellApi cSpellApi)
{
    const bool debugFlag = false;
    string inText = "hotflashes and knowaboutare not forr playsure dur ing my disa ppoint ment.";

    // ProcessByTokenObj works on a token list, not on raw text
    List<TokenObj> tokens = TextObj.TextToTokenList(inText);
    List<TokenObj> outTokenList = ProcessByTokenObj(tokens, cSpellApi, debugFlag);
    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("----- Details -----------");
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(outTokenList));
}
// private methods

/// <summary>
/// Console demo: runs the two-argument ProcessByTokenObj on the tokens of
/// a fixed sample sentence and prints the text before/after plus the
/// ToString() of every output token.
/// </summary>
/// <param name="cSpellApi"> forwarded unchanged to ProcessByTokenObj </param>
private static void Test(CSpellApi cSpellApi)
{
    Console.WriteLine("----- Test Pre-Correction Text: -----");
    string inText = "We cant spel ACHindex 987Pfimbria dianosed.";

    List<TokenObj> outTokenList = ProcessByTokenObj(
        TextObj.TextToTokenList(inText), cSpellApi);
    string outText = TextObj.TokenListToText(outTokenList);

    // Report
    Console.WriteLine("--------------------");
    Console.WriteLine($"In: [{inText}]");
    Console.WriteLine($"Out: [{outText}]");
    Console.WriteLine("----- Details -----------");
    for (int i = 0; i < outTokenList.Count; i++)
    {
        Console.WriteLine(i + "|" + outTokenList[i].ToString());
    }
}
// private methods

/// <summary>
/// Console demo of GetCandidates: builds a CSpellApi from the default
/// properties file, tokenizes a fixed sample sentence, dumps the tokens,
/// and prints every MergeObj returned for target position 4 of the
/// non-space token list.
/// </summary>
private static void Test()
{
    const int tarPos = 4;
    const string separator = "-------------------------";

    CSpellApi cSpellApi = new CSpellApi("../data/Config/cSpell.properties");
    Console.WriteLine("===== Unit Test of MergeCandidates =====");

    // Other inputs that have been used with this demo:
    //   "He was dia gnosed early onset deminita 3 year ago."   (example from 73.txt)
    //   "I have seven live births with no problems dur ing my pregnancies. That is a dis appoint ment"
    string inText = "That is a disa ppoint ment.";
    List<TokenObj> inTextList = TextObj.TextToTokenList(inText);

    // Dump the tokens: first joined on one line, then one per line
    string joinedTokens = String.Join("|", inTextList.Select(t => t.GetTokenStr()));
    Console.WriteLine($" - inTextList ({inTextList.Count}): [{joinedTokens}]");
    Console.WriteLine(separator);
    foreach (TokenObj token in inTextList)
    {
        Console.WriteLine(token.ToString());
    }

    Console.WriteLine(separator);
    Console.WriteLine("- tarPos: " + tarPos);
    Console.WriteLine("- maxMergeNo: " + cSpellApi.GetCanNwMaxMergeNo());
    Console.WriteLine("------ merge set -------");

    // Candidates are looked up against the list without space tokens
    List<TokenObj> nonSpaceTextList = TextObj.GetNonSpaceTokenObjList(inTextList);
    HashSet<MergeObj> mergeSet = GetCandidates(tarPos, nonSpaceTextList, cSpellApi);
    foreach (MergeObj candidate in mergeSet)
    {
        Console.WriteLine(candidate.ToString());
    }
}
/// <summary>
/// Console demo of GetContext on a fixed sample sentence. Prints the
/// context of every non-space token three times: with radius 3 and
/// skip-word off, with radius 3 and skip-word on, and with skip-word on
/// using the overload that takes no radius.
/// </summary>
/// <param name="w2vIm"> Word2Vec instance passed to every GetContext call </param>
/// <param name="w2vOm"> not used by this method </param>
private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
{
    string inText = "... last 10 years #$% was dianosed test123 yahoo.com early on set deminita 3 year ago.";
    List<TokenObj> inTextList = TextObj.TextToTokenList(inText);
    Console.WriteLine("======= Word2VecContext ======================");
    Console.WriteLine(" - inText: [" + inText + "]");
    string inStr = String.Join("|", inTextList.Select(obj => obj.GetTokenStr()));
    Console.WriteLine(" - inTextList (" + inTextList.Count + "): [" + inStr + "]");

    int tarSize = 1;
    int radius = 3;
    bool debugFlag = false;

    Console.WriteLine("------ Test GetContext (no skip), radius=3 ...");
    // remove space tokens from the list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);

    // tarPos counts non-space tokens only; index is the position in the
    // full token list (spaces included).
    int tarPos = 0;
    for (int index = 0; index < inTextList.Count; index++)
    {
        TokenObj tokenObj = inTextList[index];
        if (tokenObj.IsSpaceToken())
        {
            continue;
        }
        string tokenStr = tokenObj.GetTokenStr();
        // word2VecSkipWord = false (no skip)
        List<string> contextList = GetContext(tarPos, tarSize, nonSpaceTokenList,
            w2vIm, radius, false, debugFlag);
        string contextStr = String.Join("|", contextList);
        Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr + "]");
        tarPos++;
    }

    Console.WriteLine("------ Test GetContext (skip) , radius=3 ...");
    Console.WriteLine(" - inText: [" + inText + "]");
    // Both counters restart for this pass. Previously only tarPos was
    // reset, so the printed index carried on from the end of the first pass.
    tarPos = 0;
    for (int index = 0; index < inTextList.Count; index++)
    {
        TokenObj tokenObj = inTextList[index];
        if (tokenObj.IsSpaceToken())
        {
            continue;
        }
        string tokenStr = tokenObj.GetTokenStr();
        // word2VecSkipWord = true (skip)
        List<string> contextList2 = GetContext(tarPos, tarSize, nonSpaceTokenList,
            w2vIm, radius, true, debugFlag);
        string contextStr2 = String.Join("|", contextList2);
        Console.WriteLine(tarPos + "|" + index + "|" + tokenStr + ": [" + contextStr2 + "]");
        tarPos++;
    }

    Console.WriteLine("------ Test GetContext (skip) , all ...");
    Console.WriteLine(" - inText: [" + inText + "]");
    tarPos = 0;
    // the list holds non-space tokens only, so no space check is needed
    foreach (TokenObj tokenObj in nonSpaceTokenList)
    {
        string tokenStr = tokenObj.GetTokenStr();
        // word2VecSkipWord = true (skip), overload without a radius
        List<string> contextList3 = GetContext(tarPos, tarSize, nonSpaceTokenList,
            w2vIm, true, debugFlag);
        string contextStr3 = String.Join("|", contextList3);
        Console.WriteLine(tarPos + "|" + tokenStr + ": [" + contextStr3 + "]");
        tarPos++;
    }
}
/// <summary>
/// Console demo that scores candidate words against context vectors built
/// by Word2VecContext.GetContextVec, for four fixed sample sentences:
/// diagnosed/diagnose/dianosed, "know about"/know/about, onset/"on set",
/// and finally a sweep over every token of a sentence with radius 1 to 9.
/// Scoring and printing are done by the Test and TestWin helpers.
/// </summary>
/// <param name="w2vIm"> Word2Vec instance passed to GetContextVec and to the helpers </param>
/// <param name="w2vOm"> Word2Vec instance passed to the helpers </param>
private static void Tests(Word2Vec w2vIm, Word2Vec w2vOm)
{
    bool word2VecSkipWord = true;
    bool debugFlag = false;

    // ----- 1. diagnosed | diagnose | dianosed -----
    string inText = "for the last 10 years was dianosed\n early on set deminita 3 years ago";
    List<TokenObj> inTextList = TextObj.TextToTokenList(inText);
    // remove space tokens from the list
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTextList);
    List<string> testStrList = new List<string> { "diagnosed", "diagnose", "dianosed" };

    int tarPos = 6;
    int tarSize = 1;
    int radius = 2;
    DoubleVec contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList, w2vIm, radius, word2VecSkipWord, debugFlag);
    Console.WriteLine("===== Test diagnosed|diagnose|dianosed (window-2) =====");
    Console.WriteLine("inText: [" + inText + "]");
    Console.WriteLine("============================================");
    Console.WriteLine("Candidates|CBOW score|CBOW score 2|Similarity score");
    Console.WriteLine("============================================");
    foreach (string testStr in testStrList)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    Console.WriteLine("===== Test diagnosed|diagnose|dianosed (whole text) =====");
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList, w2vIm, word2VecSkipWord, debugFlag);
    foreach (string testStr in testStrList)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    // ----- 2. know about | know | about -----
    string inText1 = "Not all doctors know about this syndrome.";
    List<TokenObj> inTextList1 = TextObj.TextToTokenList(inText1);
    List<TokenObj> nonSpaceTokenList1 = TextObj.GetNonSpaceTokenObjList(inTextList1);
    Console.WriteLine("===== Test know about|know|about (window) =====");
    List<string> testStrList1 = new List<string> { "know about", "know", "about" };

    // each candidate is scored with the windowed context, then the
    // no-radius overload
    tarPos = 3;
    tarSize = 2;
    radius = 2;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test(testStrList1[0], contextVec, w2vIm, w2vOm);
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
    Test(testStrList1[0], contextVec, w2vIm, w2vOm);

    tarPos = 3;
    tarSize = 1;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test(testStrList1[1], contextVec, w2vIm, w2vOm);
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
    Test(testStrList1[1], contextVec, w2vIm, w2vOm);

    tarPos = 4;
    tarSize = 1;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList1, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test(testStrList1[2], contextVec, w2vIm, w2vOm);
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList1, w2vIm, word2VecSkipWord, debugFlag);
    Test(testStrList1[2], contextVec, w2vIm, w2vOm);

    // ----- 3. onset | on set -----
    string inText2 = "for the last 10 years was diagnosed early on set dementia 3 years ago.";
    List<TokenObj> inTextList2 = TextObj.TextToTokenList(inText2);
    List<TokenObj> nonSpaceTokenList2 = TextObj.GetNonSpaceTokenObjList(inTextList2);
    List<string> testStrList2 = new List<string> { "onset", "on set" };
    Console.WriteLine("===== Test onset|on set (window-3) =====");
    // Print the sentence this section actually works on (was: inText).
    Console.WriteLine("inText: [" + inText2 + "]");

    tarPos = 8;
    tarSize = 2;
    radius = 3;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
    foreach (string testStr in testStrList2)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    tarPos = 8;
    tarSize = 1;
    radius = 3;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test("on", contextVec, w2vIm, w2vOm);

    tarPos = 9;
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList2, w2vIm, radius, word2VecSkipWord, debugFlag);
    Test("set", contextVec, w2vIm, w2vOm);

    Console.WriteLine("===== Test onset|on set (whole text) =====");
    // NOTE(review): tarPos/tarSize are still 9/1 from the "set" test above,
    // while the candidates below are "onset"/"on set" (the windowed variant
    // used 8/2). Left unchanged - confirm which target is intended.
    contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
        nonSpaceTokenList2, w2vIm, word2VecSkipWord, debugFlag);
    foreach (string testStr in testStrList2)
    {
        Test(testStr, contextVec, w2vIm, w2vOm);
    }

    // ----- 4. every token, radius 1..9 -----
    Console.WriteLine("===== Go through each tokens with diff radius 1-9) =====");
    Console.WriteLine("tarPos|tarWord|r=1|r=2|r=3|r=4|r=5|r=6|r=7|r=8|r=9");
    // alternative input: "Broken bones can not sleep at night!"
    string inText3 = "not xyxy all doctors know about this syndrome.";
    List<TokenObj> inTextList3 = TextObj.TextToTokenList(inText3);
    List<TokenObj> nonSpaceTokenList3 = TextObj.GetNonSpaceTokenObjList(inTextList3);

    tarPos = 0;
    tarSize = 1;
    foreach (TokenObj tokenObj in nonSpaceTokenList3)
    {
        string tokenStr = tokenObj.GetTokenStr();
        string inStr = Word2VecContext.NormWordForWord2Vec(tokenStr);
        Console.Write(tarPos + "|" + tokenStr + "|");
        // print out all radius
        for (int r = 1; r < 10; r++)
        {
            // The context must come from the list being iterated. The
            // previous code passed inTextList2, which belongs to a
            // different sentence and still contains space tokens.
            contextVec = Word2VecContext.GetContextVec(tarPos, tarSize,
                nonSpaceTokenList3, w2vIm, r, word2VecSkipWord, debugFlag);
            TestWin(inStr, contextVec, w2vIm, w2vOm);
        }
        Console.WriteLine("");
        tarPos++;
    }
}