/// <summary>
/// Splits the input word by adding a space after ending punctuation.
/// The input must be a single word (no space).
/// </summary>
/// <remarks>
/// The word is converted to a CoreTermObj and a space is added after an
/// ending punctuation in each of its components, so the current algorithm
/// can handle at most 3 ending punctuations, one per component:
/// - prefix: leading str with punc|space|number
/// - coreTerm: the original str - prefix - suffix
/// - suffix: ending str with punc|space|number
/// This can be improved by using a recursive algorithm in the coreTerm.
/// For example: "ankle,before.The" in 15737.txt would be split twice by a
/// recursive algorithm.
/// </remarks>
/// <param name="inWord">The input token (single word).</param>
/// <returns>
/// The split word, or <paramref name="inWord"/> itself when the word is not
/// qualified for splitting or no component needed a split.
/// </returns>
public static string Process(string inWord)
{
    int ctType = CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT;

    // Pre-process: words that are not qualified are returned untouched.
    if (!IsQualified(inWord))
    {
        return inWord;
    }

    // 0. convert to coreTerm object
    bool splitFlag = false;
    CoreTermObj cto = new CoreTermObj(inWord, ctType);

    // 1. update the coreTerm: add a space after its last endingPunc
    string inCoreTerm = cto.GetCoreTerm();
    string lastEndingPunc = FindLastEndingPunc(inCoreTerm);
    if (lastEndingPunc != null)
    {
        string outCoreTerm = EndingPunc.GetSplitStr(inCoreTerm, lastEndingPunc);
        cto.SetCoreTerm(outCoreTerm);
        splitFlag = true;
    }

    // 2. update the prefix when it ends with an endingPunc
    // (prefix contains punctuation and numbers)
    string prefix = cto.GetPrefix();
    if (prefix.Length != 0 && EndsWithEndingPunc(prefix))
    {
        cto.SetPrefix(prefix + GlobalVars.SPACE_STR);
        splitFlag = true;
    }

    // 3. update the suffix: add a space after its last endingPunc
    // (suffix contains punctuation and numbers). A suffix made purely of
    // endingPuncs is left alone.
    string suffix = cto.GetSuffix();
    if (suffix.Length != 0
        && ContainsEndingPunc(suffix)
        && !IsPureEndingPunc(suffix))
    {
        string lastEndingPunc2 = FindLastEndingPunc(suffix);
        if (lastEndingPunc2 != null)
        {
            string outSuffix = EndingPunc.GetSplitStr(suffix, lastEndingPunc2);
            cto.SetSuffix(outSuffix);
            splitFlag = true;
        }
    }

    // Only re-assemble the word when at least one component was changed.
    return splitFlag ? cto.ToString() : inWord;
}
/// <summary>
/// Manual test driver: prints the result of EndingPunc.IsException with
/// ENDING_CA for a fixed set of comma-containing sample words.
/// </summary>
private static void TestExceptionCA()
{
    Console.WriteLine("----- Ending Punc Exception: Comma -----");

    string[] samples =
    {
        "50,000",
        "1,234,567",
        "123",
        "12,34",
    };

    foreach (string sample in samples)
    {
        var verdict = EndingPunc.IsException(sample, ENDING_CA);
        Console.WriteLine("- IsException(" + sample + "): " + verdict);
    }
}
/// <summary>
/// Manual test driver: prints the result of EndingPunc.IsException with
/// ENDING_EM for a fixed set of sample words containing an exclamation mark.
/// </summary>
private static void TestExceptionEM()
{
    Console.WriteLine("----- Ending Punc Exception: Exclamation Mark -----");

    string[] samples =
    {
        "ulcers!'",
        "ulcers!\"",
        "ulcers!]",
        "XX!'test",
    };

    foreach (string sample in samples)
    {
        var verdict = EndingPunc.IsException(sample, ENDING_EM);
        Console.WriteLine("- IsException(" + sample + "): " + verdict);
    }
}
/// <summary>
/// Manual test driver: prints the result of EndingPunc.IsException with
/// ENDING_QM for a fixed set of sample words containing a question mark.
/// </summary>
private static void TestExceptionQM()
{
    Console.WriteLine("----- Ending Punc Exception: Question Mark -----");

    string[] samples =
    {
        "ulcers?'", // 12769.txt
        "ulcers?\"",
        "ulcers?]",
        "XX?'test",
    };

    foreach (string sample in samples)
    {
        var verdict = EndingPunc.IsException(sample, ENDING_QM);
        Console.WriteLine("- IsException(" + sample + "): " + verdict);
    }
}
/// <summary>
/// Manual test driver: prints the result of EndingPunc.IsException with
/// ENDING_P for a fixed set of sample words containing periods. The samples
/// are grouped by the numbered period-exception case they are meant to
/// exercise.
/// </summary>
private static void TestExceptionP()
{
    Console.WriteLine("----- Ending Punc Exception: period -----");

    string[] samples =
    {
        // exception 1
        "Dr.s",
        "Mr.s",

        // exception 2
        "16q22.1",
        "123.2",
        "123.234.4567",
        "1c3.2d4.4e6",
        "123.23a4.456",
        "123a.234.456",

        // exception 3
        "D.C.A.B.",
        "D.C.A.B",
        "d.c.a.",
        "d.c.a",
        "D.c",
        "D.CC.A.B.",
        "DD.C.A.B.",
        "d.1.a.",
        "D.123.A.B.",

        // exception 4
        "St.-John",
        "123.-John",
        "#$.-John",
        "St.$%^John",
        "St.John",
        "St.J.",
        "Test...123",
    };

    foreach (string sample in samples)
    {
        var verdict = EndingPunc.IsException(sample, ENDING_P);
        Console.WriteLine("- IsException(" + sample + "): " + verdict);
    }
}