/// <summary>
        /// Splits a single input token by adding a space after ending punctuation.
        /// The input must be a single word (no spaces).
        /// The token is decomposed into a CoreTermObj and each of its three
        /// components is examined independently, so the current algorithm can
        /// handle at most three ending punctuations, one per component:
        /// - prefix: leading string of punctuation|space|number
        /// - coreTerm: the original string minus prefix and suffix
        /// - suffix: trailing string of punctuation|space|number
        /// This could be improved by applying the split recursively to the
        /// coreTerm. For example, "ankle,before.The" in 15737.txt would be split
        /// twice by a recursive algorithm.
        /// </summary>
        /// <param name="inWord">The input token (a single word).</param>
        /// <returns>
        /// The string form of the updated CoreTermObj when at least one component
        /// was changed; otherwise <paramref name="inWord"/> itself.
        /// </returns>
        public static string Process(string inWord)
        {
            // Pre-process: tokens that do not qualify are returned untouched.
            if (!IsQualified(inWord))
            {
                return inWord;
            }

            // 0. Convert the token to a coreTerm object (prefix / coreTerm / suffix).
            CoreTermObj cto = new CoreTermObj(inWord, CoreTermUtil.CT_TYPE_SPACE_PUNC_DIGIT);
            bool splitFlag = false;

            // 1. Update the coreTerm: split at its last ending punctuation, if any.
            string inCoreTerm = cto.GetCoreTerm();
            string lastEndingPunc = FindLastEndingPunc(inCoreTerm);
            if (lastEndingPunc != null)
            {
                cto.SetCoreTerm(EndingPunc.GetSplitStr(inCoreTerm, lastEndingPunc));
                splitFlag = true;
            }

            // 2. Update the prefix (punctuation and numbers) when it ends with an
            //    ending punctuation: append a single space to it.
            string prefix = cto.GetPrefix();
            if ((prefix.Length != 0) && EndsWithEndingPunc(prefix))
            {
                cto.SetPrefix(prefix + GlobalVars.SPACE_STR);
                splitFlag = true;
            }

            // 3. Update the suffix (punctuation and numbers): split at its last
            //    ending punctuation, but leave a suffix made purely of ending
            //    punctuation alone.
            string suffix = cto.GetSuffix();
            if ((suffix.Length != 0) && ContainsEndingPunc(suffix) && !IsPureEndingPunc(suffix))
            {
                string lastSuffixPunc = FindLastEndingPunc(suffix);
                if (lastSuffixPunc != null)
                {
                    cto.SetSuffix(EndingPunc.GetSplitStr(suffix, lastSuffixPunc));
                    splitFlag = true;
                }
            }

            // Only rebuild the word when something actually changed.
            return splitFlag ? cto.ToString() : inWord;
        }
        // Example #2
        /// <summary>
        /// Prints, for a fixed set of numeric sample tokens (with and without
        /// commas), the result of EndingPunc.IsException evaluated with ENDING_CA.
        /// </summary>
        private static void TestExceptionCA()
        {
            Console.WriteLine("----- Ending Punc Exception: Comma -----");

            var samples = new List<string>
            {
                "50,000",
                "1,234,567",
                "123",
                "12,34"
            };

            // One output line per sample, in insertion order.
            foreach (string sample in samples)
            {
                var verdict = EndingPunc.IsException(sample, ENDING_CA);
                Console.WriteLine("- IsException(" + sample + "): " + verdict);
            }
        }
        // Example #3
        /// <summary>
        /// Prints, for a fixed set of sample tokens containing an exclamation
        /// mark, the result of EndingPunc.IsException evaluated with ENDING_EM.
        /// </summary>
        private static void TestExceptionEM()
        {
            Console.WriteLine("----- Ending Punc Exception: Exclamation Mark -----");

            var samples = new List<string>
            {
                "ulcers!'",
                "ulcers!\"",
                "ulcers!]",
                "XX!'test"
            };

            // One output line per sample, in insertion order.
            foreach (string sample in samples)
            {
                var verdict = EndingPunc.IsException(sample, ENDING_EM);
                Console.WriteLine("- IsException(" + sample + "): " + verdict);
            }
        }
        // Example #4
        /// <summary>
        /// Prints, for a fixed set of sample tokens containing a question mark,
        /// the result of EndingPunc.IsException evaluated with ENDING_QM.
        /// </summary>
        private static void TestExceptionQM()
        {
            Console.WriteLine("----- Ending Punc Exception: Question Mark -----");

            var samples = new List<string>
            {
                "ulcers?'",   // 12769.txt
                "ulcers?\"",
                "ulcers?]",
                "XX?'test"
            };

            // One output line per sample, in insertion order.
            foreach (string sample in samples)
            {
                var verdict = EndingPunc.IsException(sample, ENDING_QM);
                Console.WriteLine("- IsException(" + sample + "): " + verdict);
            }
        }
        // Example #5
        /// <summary>
        /// Prints, for a fixed set of sample tokens containing periods, the
        /// result of EndingPunc.IsException evaluated with ENDING_P. The samples
        /// are grouped by the numbered exception case they are meant to exercise.
        /// </summary>
        private static void TestExceptionP()
        {
            Console.WriteLine("----- Ending Punc Exception: period -----");

            var samples = new List<string>
            {
                // exception 1
                "Dr.s",
                "Mr.s",
                // exception 2
                "16q22.1",
                "123.2",
                "123.234.4567",
                "1c3.2d4.4e6",
                "123.23a4.456",
                "123a.234.456",
                // exception 3
                "D.C.A.B.",
                "D.C.A.B",
                "d.c.a.",
                "d.c.a",
                "D.c",
                "D.CC.A.B.",
                "DD.C.A.B.",
                "d.1.a.",
                "D.123.A.B.",
                // exception 4
                "St.-John",
                "123.-John",
                "#$.-John",
                "St.$%^John",
                "St.John",
                "St.J.",
                "Test...123"
            };

            // One output line per sample, in insertion order.
            foreach (string sample in samples)
            {
                var verdict = EndingPunc.IsException(sample, ENDING_P);
                Console.WriteLine("- IsException(" + sample + "): " + verdict);
            }
        }