/// <summary>
/// Manual smoke test: tokenizes a fixed sample text, runs every token through
/// <c>XmlHtmlHandler.Process</c> and then <c>LeadingPuncSplitter.Process</c>,
/// and prints the input text, the resulting text and each output token's
/// history string to the console.
/// </summary>
private static void TestProcessText()
{
    Console.WriteLine("----- Test Process Text: -----");

    // The double quote inside the sample (5'8") has to be escaped; left bare it
    // terminates the string literal early and the method does not compile.
    string inText = "Head rolling &amp; rock(5'8\").";
    int maxRecursive = 5;

    // Tokenize, then apply both handlers to each token in order.
    List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);
    List<TokenObj> outTokenList = new List<TokenObj>(
        inTokenList
            .Select(token => XmlHtmlHandler.Process(token))
            .Select(token => LeadingPuncSplitter.Process(token, maxRecursive)));

    // Convert the processed tokens back to text.
    string outText = TextObj.TokenListToText(outTokenList);

    // Report.
    Console.WriteLine("--------- LeadingPuncSplitter( ) Test -----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
    Console.WriteLine("----- Details -----------");
    int index = 0;
    foreach (TokenObj tokenObj in outTokenList)
    {
        Console.WriteLine(index + "|" + tokenObj.ToHistString());
        index++;
    }
}
/// <summary>
/// Manual smoke test: runs a fixed sample text through the XML/HTML handler,
/// the leading-punctuation splitter, the leading-digit splitter and the
/// informal-expression handler, then prints the result and per-token history.
/// </summary>
/// <param name="informalExpMap">Map handed to <c>InformalExpHandler.Process</c>.</param>
private static void TestProcess(Dictionary<string, string> informalExpMap)
{
    Console.WriteLine("----- Test Process Text: -----");
    string inText = "u rolling &amp; pls(12years).";

    List<TokenObj> inTokenList = TextObj.TextToTokenList(inText);

    // Each stage stays lazy, so tokens still flow through the whole pipeline
    // one at a time when the final list is built.
    var xmlHandled = inTokenList.Select(t => XmlHtmlHandler.Process(t));
    var puncSplit = xmlHandled
        .Select(t => LeadingPuncSplitter.Process(t))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));
    var digitSplit = puncSplit
        .Select(t => LeadingDigitSplitter.Process(t))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));
    var informalHandled = digitSplit.Select(t => InformalExpHandler.Process(t, informalExpMap));

    List<TokenObj> outTokenList = new List<TokenObj>(informalHandled);
    string outText = TextObj.TokenListToText(outTokenList);

    Console.WriteLine("--------- LeadingDigitSplitter( ) Test ----------");
    Console.WriteLine("In: [" + inText + "]");
    Console.WriteLine("Out: [" + outText + "]");
    Console.WriteLine("----- Details -----------");
    for (int i = 0; i < outTokenList.Count; i++)
    {
        Console.WriteLine(i + "|" + outTokenList[i].ToHistString());
    }
}
// private methods
/// <summary>
/// Manual smoke test: builds a <c>CSpellApi</c> from the given configuration
/// file, runs a fixed sample text through <c>ProcNdCorrector.Process</c> and
/// prints the input, the output and each output token's history string.
/// </summary>
/// <param name="configFile">Configuration file passed to the <c>CSpellApi</c> constructor.</param>
private static void Test(string configFile)
{
    Console.WriteLine("----- Test Pre-Correction Text: -----");
    string sampleText = "We cant theredve hell.Plz u r good123. ";

    // Settings are read from the API before any token work starts.
    CSpellApi api = new CSpellApi(configFile);
    int maxSplitNo = api.GetCanNdMaxSplitNo();
    Dictionary<string, string> informalMap = api.GetInformalExpressionMap();
    const bool debugFlag = false;

    // Tokenize, correct, and convert back to text.
    List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
    List<TokenObj> resultTokens = ProcNdCorrector.Process(sourceTokens, maxSplitNo, informalMap, debugFlag);
    string resultText = TextObj.TokenListToText(resultTokens);

    Console.WriteLine("--------------------");
    Console.WriteLine("In: [" + sampleText + "]");
    Console.WriteLine("Out: [" + resultText + "]");
    Console.WriteLine("----- Details -----------");
    for (int i = 0; i < resultTokens.Count; i++)
    {
        Console.WriteLine(i + "|" + resultTokens[i].ToHistString());
    }
}
/// <summary>
/// String-in / string-out convenience wrapper: delegates to the
/// <c>Process</c> overload that takes the raw text and converts the returned
/// token list back to text. Works with <c>TextObj</c> (not <c>TextIoObj</c>).
/// </summary>
/// <param name="inText">Text to process.</param>
/// <param name="maxSpRecursiveNo">Forwarded unchanged to <c>Process</c>.</param>
/// <param name="infExpMap">Forwarded unchanged to <c>Process</c>.</param>
/// <param name="debugFlag">Forwarded unchanged to <c>Process</c>.</param>
/// <returns>The text rebuilt from the processed token list.</returns>
public static string ProcessByStr(string inText, int maxSpRecursiveNo, Dictionary<string, string> infExpMap, bool debugFlag)
{
    List<TokenObj> processedTokens = Process(inText, maxSpRecursiveNo, infExpMap, debugFlag);
    return TextObj.TokenListToText(processedTokens);
}
/// <summary>
/// Tokenizes <paramref name="inText"/>, runs the tokens through
/// <c>ProcessByTokenObj</c> and returns both the rebuilt text and the
/// processed token list.
/// </summary>
/// <param name="inText">Text to process.</param>
/// <param name="cSpellApi">API object forwarded to <c>ProcessByTokenObj</c>.</param>
/// <param name="debugFlag">Forwarded to <c>ProcessByTokenObj</c>.</param>
/// <returns>A tuple of (output text, output token list).</returns>
public static (string, List<TokenObj>) ProcessToStrExt(string inText, CSpellApi cSpellApi, bool debugFlag)
{
    List<TokenObj> processedTokens =
        ProcessByTokenObj(TextObj.TextToTokenList(inText), cSpellApi, debugFlag);

    return (TextObj.TokenListToText(processedTokens), processedTokens);
}
// public method
/// <summary>
/// Real-word merge step ("5. RealWord-Merge"): scans the token list for merge
/// corrections, collects them, and then applies all of them in one pass via
/// <c>MergeCorrector.CorrectTokenListByMerge</c>.
/// </summary>
/// <param name="inTokenList">The token list to process; its index positions are refreshed in place.</param>
/// <param name="cSpellApi">Supplies the maximum legit token length and is forwarded to the correctors.</param>
/// <param name="debugFlag">Forwarded to the debug printers and correctors.</param>
/// <returns>The token list produced by <c>MergeCorrector.CorrectTokenListByMerge</c>.</returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("5. RealWord-Merge", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    // Pre-process: refresh the position stored on every token.
    TextObj.UpdateIndexPos(inTokenList);

    // Non-space view of the input, handed to the merge corrector.
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);

    List<MergeObj> merges = new List<MergeObj>();
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

    int cursor = 0;
    while (cursor < inTokenList.Count)
    {
        TokenObj current = inTokenList[cursor];

        // SCR-3: only legit tokens are candidates; anything else is stepped over.
        if (!current.IsLegitToken(maxLegitTokenLength))
        {
            cursor++;
            continue;
        }

        // Ask for the top-ranked merge correction at this token's position.
        MergeObj merge = RealWordMergeCorrector.GetCorrectTerm(
            current.GetPos(), nonSpaceTokenList, cSpellApi, debugFlag);
        if (merge == null)
        {
            // No merge correction here.
            cursor++;
            continue;
        }

        merges.Add(merge);
        // Resume right after the merge's end index so merges never overlap.
        cursor = merge.GetEndIndex() + 1;
    }

    // Apply the merges only after the scan, because a merge may involve tokens
    // that precede the one being examined.
    return MergeCorrector.CorrectTokenListByMerge(inTokenList, merges, TokenObj.HIST_RW_M, debugFlag, cSpellApi);
}
/// <summary>
/// Repeatedly applies the single-pass <c>Process</c> until the text stops
/// changing or the repeat budget is used up.
/// </summary>
/// <param name="inWord">Text to process.</param>
/// <param name="maxProcess">Maximum number of additional passes after the first one.</param>
/// <returns>The text after the last pass performed.</returns>
public static string Process(string inWord, int maxProcess)
{
    string previous = inWord;
    string current = Process(inWord);

    for (int remaining = maxProcess; remaining > 0 && !current.Equals(previous); remaining--)
    {
        previous = current;

        // Re-tokenize the latest text and process every token again.
        List<TokenObj> tokens = TextObj.TextToTokenList(previous);
        List<TokenObj> reprocessed = new List<TokenObj>(tokens.Select(token => Process(token)));
        current = TextObj.TokenListToText(reprocessed);
    }

    return current;
}
/// <summary>
/// Non-dictionary step ("1. NonDictionary"): sends every token through the
/// XML/HTML handler, the ending- and leading-punctuation splitters, the
/// leading- and ending-digit splitters and finally the informal-expression
/// handler. Each splitter's result is flattened with
/// <c>TextObj.FlatTokenToArrayList</c> before the next stage runs.
/// </summary>
/// <param name="inTokenList">Tokens to process.</param>
/// <param name="ndMaxSplitNo">Passed to the two punctuation splitters.</param>
/// <param name="infExpMap">Passed to <c>InformalExpHandler.Process</c>.</param>
/// <param name="debugFlag">Forwarded to the debug printers and every stage.</param>
/// <returns>A new list holding the processed tokens.</returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, int ndMaxSplitNo, Dictionary<string, string> infExpMap, bool debugFlag)
{
    DebugPrint.PrintProcess("1. NonDictionary", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    // Every stage is a lazy query, so each token still travels through the
    // complete pipeline before the next token is started.
    var xmlHandled = inTokenList
        .Select(t => XmlHtmlHandler.Process(t, debugFlag));

    var endPuncSplit = xmlHandled
        .Select(t => EndingPuncSplitter.Process(t, ndMaxSplitNo, debugFlag))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));

    var leadPuncSplit = endPuncSplit
        .Select(t => LeadingPuncSplitter.Process(t, ndMaxSplitNo, debugFlag))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));

    var leadDigitSplit = leadPuncSplit
        .Select(t => LeadingDigitSplitter.Process(t, debugFlag))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));

    var endDigitSplit = leadDigitSplit
        .Select(t => EndingDigitSplitter.Process(t, debugFlag))
        .SelectMany(t => TextObj.FlatTokenToArrayList(t));

    var informalHandled = endDigitSplit
        .Select(t => InformalExpHandler.Process(t, infExpMap, debugFlag));

    return new List<TokenObj>(informalHandled);
}
// public method
/// <summary>
/// Non-word split and 1-to-1 correction step ("3-4. NonWord-Split and 1To1").
/// Walks the whole input token list; legit tokens are replaced by the term
/// returned from <c>NonWordCorrector.GetCorrectTerm</c>, all other tokens are
/// passed through unchanged.
/// </summary>
/// <param name="inTokenList">The token list for the whole text.</param>
/// <param name="cSpellApi">Supplies the maximum legit token length and is forwarded to the corrector.</param>
/// <param name="debugFlag">Forwarded to the debug printers and the corrector.</param>
/// <returns>A new list built through <c>Split1To1Corrector.AddSplit1To1Correction</c>.</returns>
public static List<TokenObj> Process(List<TokenObj> inTokenList, CSpellApi cSpellApi, bool debugFlag)
{
    DebugPrint.PrintProcess("3-4. NonWord-Split & 1To1", debugFlag);
    DebugPrint.PrintInText(TextObj.TokenListToText(inTokenList), debugFlag);

    List<TokenObj> correctedTokens = new List<TokenObj>();

    // Non-space view of the input, handed to the corrector together with the
    // running position below (presumably as context; confirm in the corrector).
    List<TokenObj> nonSpaceTokenList = TextObj.GetNonSpaceTokenObjList(inTokenList);
    int maxLegitTokenLength = cSpellApi.GetMaxLegitTokenLength();

    // Counts the legit tokens seen so far; advanced only for legit tokens.
    int legitPos = 0;

    // Iterate the original list so non-legit tokens (e.g. spaces) are kept.
    foreach (TokenObj current in inTokenList)
    {
        TokenObj replacement = current;

        // SCR-3: only legit tokens are sent to the corrector.
        if (current.IsLegitToken(maxLegitTokenLength))
        {
            // The corrected term is the highest-ranked candidate.
            replacement = NonWordCorrector.GetCorrectTerm(current, cSpellApi, debugFlag, legitPos, nonSpaceTokenList);
            legitPos++;
        }

        // A correction may be a split, so the helper decides how the
        // replacement is added to the output list.
        Split1To1Corrector.AddSplit1To1Correction(correctedTokens, replacement);
    }

    return correctedTokens;
}
/// <summary>
/// Manual smoke test: runs a fixed lower-case sample sentence through
/// <c>Process</c> with debugging off and prints the input, the output and
/// the operation details of the resulting tokens.
/// </summary>
/// <param name="cSpellApi">API object forwarded to <c>Process</c>.</param>
private static void TestProcess(CSpellApi cSpellApi)
{
    const bool debugFlag = false;
    string sampleText = "hotflashes and knowaboutare not forr playsure.";

    // Tokenize, correct, and convert back to text.
    List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
    List<TokenObj> resultTokens = Process(sourceTokens, cSpellApi, debugFlag);
    string resultText = TextObj.TokenListToText(resultTokens);

    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine("In: [" + sampleText + "]");
    Console.WriteLine("Out: [" + resultText + "]");
    Console.WriteLine("----- Details -----------");
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(resultTokens));
}
/// <summary>
/// Manual smoke test: runs a fixed sample containing words broken by stray
/// spaces through <c>Process</c> with debugging off and prints the input,
/// the output and the operation details of the resulting tokens.
/// </summary>
/// <param name="cSpellApi">API object forwarded to <c>Process</c>.</param>
private static void TestProcess(CSpellApi cSpellApi)
{
    const bool debugFlag = false;
    string sampleText = "She had problems dur ing her pregnancies. That is a dis appoint ment. Good!";

    // Tokenize, correct, and convert back to text.
    List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
    List<TokenObj> resultTokens = Process(sourceTokens, cSpellApi, debugFlag);
    string resultText = TextObj.TokenListToText(resultTokens);

    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine("In: [" + sampleText + "]");
    Console.WriteLine("Out: [" + resultText + "]");
    Console.WriteLine("----- Details -----------");
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(resultTokens));
}
// private method
/// <summary>
/// Manual smoke test for the combined flow: runs a fixed lower-case sample
/// (intended to exercise non-word, one-to-one, split and merge corrections)
/// through <c>ProcessByTokenObj</c> with debugging off and prints the input,
/// the output and the operation details of the resulting tokens.
/// </summary>
/// <param name="cSpellApi">API object forwarded to <c>ProcessByTokenObj</c>.</param>
private static void TestProcess(CSpellApi cSpellApi)
{
    const bool debugFlag = false;
    string sampleText = "hotflashes and knowaboutare not forr playsure dur ing my disa ppoint ment.";

    // Tokenize, correct, and convert back to text.
    List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
    List<TokenObj> resultTokens = ProcessByTokenObj(sourceTokens, cSpellApi, debugFlag);
    string resultText = TextObj.TokenListToText(resultTokens);

    Console.WriteLine("------ GetCorrection by Process( ) ------");
    Console.WriteLine("In: [" + sampleText + "]");
    Console.WriteLine("Out: [" + resultText + "]");
    Console.WriteLine("----- Details -----------");
    Console.WriteLine(TextObj.TokenListToOperationDetailStr(resultTokens));
}
// private methods
/// <summary>
/// Manual smoke test: runs a fixed sample text through the two-argument
/// <c>ProcessByTokenObj</c> and prints the input, the output and the
/// <c>ToString()</c> form of every output token.
/// </summary>
/// <param name="cSpellApi">API object forwarded to <c>ProcessByTokenObj</c>.</param>
private static void Test(CSpellApi cSpellApi)
{
    Console.WriteLine("----- Test Pre-Correction Text: -----");
    string sampleText = "We cant spel ACHindex 987Pfimbria dianosed.";

    // Tokenize, correct, and convert back to text.
    List<TokenObj> sourceTokens = TextObj.TextToTokenList(sampleText);
    List<TokenObj> resultTokens = ProcessByTokenObj(sourceTokens, cSpellApi);
    string resultText = TextObj.TokenListToText(resultTokens);

    Console.WriteLine("--------------------");
    Console.WriteLine("In: [" + sampleText + "]");
    Console.WriteLine("Out: [" + resultText + "]");
    Console.WriteLine("----- Details -----------");
    for (int i = 0; i < resultTokens.Count; i++)
    {
        Console.WriteLine(i + "|" + resultTokens[i].ToString());
    }
}