Beispiel #1
0
        /// <summary>
        /// Splits a raw token into one or two <c>Word</c> entries and
        /// appends them to the collection.
        /// </summary>
        /// <remarks>
        /// The rules are checked in this order:
        /// a one-character token is added unchanged; a token whose first
        /// character is not a letter or digit is split into that character
        /// and the remainder; a token whose last character is punctuation
        /// is split into the body and that punctuation mark; anything else
        /// is added unchanged. Only one split is performed per call.
        /// </remarks>
        /// <param name="words">
        /// Collection that receives the resulting word(s)
        /// </param>
        /// <param name="prefix">
        /// Text attached in front of the token; it stays with the first
        /// entry produced
        /// </param>
        /// <param name="word">
        /// Token to examine; it may carry a leading special character or
        /// a trailing punctuation mark
        /// </param>
        /// <param name="suffix">
        /// Text attached behind the token; it stays with the last entry
        /// produced
        /// </param>
        private static void processWord(WordsCollection words,
                                        string prefix, string word, string suffix)
        {
            int lastIndex = word.Length - 1;

            // A single character cannot be split any further.
            if (lastIndex == 0)
            {
                words.Add(new Word(word, prefix, suffix));
                return;
            }

            // Leading special character such as '(' or '"': report it on
            // its own, keeping the prefix with it and the suffix with the
            // remainder.
            char first = word[0];
            if (!char.IsLetterOrDigit(first))
            {
                words.Add(new Word(first.ToString(), prefix, ""));
                words.Add(new Word(word.Substring(1), "", suffix));
                return;
            }

            // Trailing punctuation: report the body and the punctuation
            // mark separately.
            char last = word[lastIndex];
            if (char.IsPunctuation(last))
            {
                words.Add(new Word(word.Substring(0, lastIndex), prefix, ""));
                words.Add(new Word(last.ToString(), "", suffix));
                return;
            }

            // Nothing to split off; treat the token as a plain word.
            words.Add(new Word(word, prefix, suffix));
        }
Beispiel #2
0
 /// <summary>
 /// Appends the items of this WordsCollection, from the specified
 /// index position through the last item, to another WordsCollection.
 /// </summary>
 /// <param name="col">
 /// WordsCollection that receives the items
 /// </param>
 /// <param name="index">
 /// Index of the first item to copy
 /// </param>
 public void CopyTo(WordsCollection col, int index)
 {
     // Take the item count once, before anything is added. When col is
     // this very collection, Add is expected to grow List.Count, and a
     // loop bounded by the live count would then never terminate.
     int count = List.Count;

     for (int i = index; i < count; i++)
     {
         col.Add(this[i]);
     }
 }
Beispiel #3
0
        /// <summary>
        /// Creates a merger for two pieces of text. Both strings are
        /// parsed into word collections, and the two search vectors are
        /// created with the combined word count of both collections.
        /// </summary>
        /// <param name="original">Original text</param>
        /// <param name="modified">Modified text</param>
        public Merger(string original, string modified)
        {
            // Break both inputs down into word collections.
            _original = HtmlTextParser.parse(original);
            _modified = HtmlTextParser.parse(modified);

            // Both vectors are constructed from the same value: the total
            // number of words in the two collections.
            int combinedCount = _original.Count + _modified.Count;

            // Holds the forward-search front line of the previous
            // search loop.
            fwdVector = new IntVector(combinedCount);

            // Holds the backward-search front line of the previous
            // search loop.
            bwdVector = new IntVector(combinedCount);
        }
Beispiel #4
0
        /// <summary>
        /// Static method that parses the passed-in string into a
        /// Words collection.
        /// </summary>
        /// <remarks>
        /// White space, control characters, html tags ("&lt;...&gt;") and
        /// "&amp;nbsp;" are accumulated into the prefix of the next word.
        /// Numeric entities with two or three digits ("&amp;#12;",
        /// "&amp;#123;") are reported as words of their own. Any other text
        /// is read up to the next white space, control character or '&lt;'
        /// (and, for text not starting with '&amp;', up to the next '&amp;')
        /// and reported together with the white space that follows it as
        /// suffix. Prefix text left over at the end of the input is reported
        /// as an empty word carrying that prefix. An html tag that is never
        /// closed is consumed to the end of the input and reported the same
        /// way.
        /// </remarks>
        /// <param name="s">
        /// String to parse. A null or empty string yields an empty
        /// collection.
        /// </param>
        /// <returns>
        /// Words Collection
        /// </returns>
        static public WordsCollection parse(string s)
        {
            WordsCollection words = new WordsCollection();

            if (string.IsNullOrEmpty(s))
            {
                return words;
            }

            int    curPos = 0;
            int    prevPos;
            string prefix = string.Empty;
            string suffix;
            string word;

            // Created on first use and then reused for the rest of the
            // call, instead of being rebuilt for every '&' encountered.
            Regex threeDigitEntity = null;
            Regex twoDigitEntity   = null;

            while (curPos < s.Length)
            {
                // eat the leading or trailing white space
                prevPos = curPos;
                while (curPos < s.Length &&
                       (char.IsControl(s[curPos]) ||
                        char.IsWhiteSpace(s[curPos])))
                {
                    curPos++;
                }
                prefix += s.Substring(prevPos, curPos - prevPos);

                if (curPos == s.Length)
                {
                    // there may be something left in the prefix
                    if (prefix != string.Empty)
                    {
                        // report an empty word with prefix.
                        words.Add(new Word("", prefix, ""));
                    }
                    break;
                }

                // we have 3 different cases here,
                // 1) if the string starts with '<', we assume
                //    that it is an html tag which will be put
                //    into prefix.
                // 2) starts with '&', we need to check if it is
                //    "&nbsp;" or "&#xxx;". If it is the former,
                //    we treat it as prefix and if it is the latter,
                //    we treat it as a word.
                // 3) a string that may be a real word or a set
                //    of words separated by "&nbsp;" or may have
                //    a leading special character or trailing
                //    punctuation.
                //
                // Another possible case that is too complicated
                // or expensive to handle is that some special
                // characters are embedded inside the word with
                // no space separation
                if (s[curPos] == '<')
                {
                    // it is an html tag, consume it as prefix.
                    // The bounds check must come before the indexer,
                    // otherwise an unclosed tag reads past the end of s.
                    prevPos = curPos;
                    while (curPos < s.Length && s[curPos] != '>')
                    {
                        curPos++;
                    }

                    if (curPos == s.Length)
                    {
                        // the html tag is not closed. We are not
                        // validating html, so take everything up to
                        // the end of the input as prefix and report
                        // an empty word with it.
                        prefix += s.Substring(prevPos);
                        words.Add(new Word("", prefix, ""));
                        break;
                    }

                    // curPos is pointing to '>': include it in the
                    // prefix and move past it.
                    prefix += s.Substring(prevPos, curPos - prevPos + 1);
                    curPos++;
                    if (curPos == s.Length)
                    {
                        // the html tag is closed but nothing more
                        // follows, so report an empty word with prefix.
                        words.Add(new Word("", prefix, ""));
                        break;
                    }
                    continue;
                }
                else if (s[curPos] == '&')
                {
                    // NOTE: the length checks below are strict, so an
                    // entity that ends exactly at the end of the input
                    // is not matched here; it falls through to the
                    // '&'-led word handling further down.
                    if (curPos + 6 < s.Length)
                    {
                        string sixChars = s.Substring(curPos, 6);

                        // case for html whitespace
                        if (sixChars == "&nbsp;")
                        {
                            prefix += "&nbsp;";
                            curPos += 6;
                            continue;
                        }

                        // case for special character like "&#123;" etc
                        if (threeDigitEntity == null)
                        {
                            threeDigitEntity = new Regex(@"&#[0-9]{3};");
                        }
                        if (threeDigitEntity.IsMatch(sixChars))
                        {
                            words.Add(new Word(sixChars, prefix, ""));
                            prefix  = string.Empty;
                            curPos += 6;
                            continue;
                        }
                    }

                    // case for special character like "&#12;" etc
                    if (curPos + 5 < s.Length)
                    {
                        string fiveChars = s.Substring(curPos, 5);

                        if (twoDigitEntity == null)
                        {
                            twoDigitEntity = new Regex(@"&#[0-9]{2};");
                        }
                        if (twoDigitEntity.IsMatch(fiveChars))
                        {
                            words.Add(new Word(fiveChars, prefix, ""));
                            prefix  = string.Empty;
                            curPos += 5;
                            continue;
                        }
                    }

                    // nothing else is treated as special, so read it
                    // as a word that starts with '&'. Note that this
                    // scan does not stop at a further '&'.
                    prevPos = curPos;
                    while (curPos < s.Length &&
                           !char.IsControl(s[curPos]) &&
                           !char.IsWhiteSpace(s[curPos]) &&
                           s[curPos] != '<')
                    {
                        curPos++;
                    }
                    word = s.Substring(prevPos, curPos - prevPos);

                    // eat the following white space as suffix
                    prevPos = curPos;
                    while (curPos < s.Length &&
                           (char.IsControl(s[curPos]) ||
                            char.IsWhiteSpace(s[curPos])))
                    {
                        curPos++;
                    }
                    suffix = s.Substring(prevPos, curPos - prevPos);

                    words.Add(new Word(word, prefix, suffix));
                    prefix = string.Empty;
                }
                else
                {
                    // eat the word
                    prevPos = curPos;
                    while (curPos < s.Length &&
                           !char.IsControl(s[curPos]) &&
                           !char.IsWhiteSpace(s[curPos]) &&
                           s[curPos] != '<' &&
                           s[curPos] != '&')
                    {
                        curPos++;
                    }
                    word = s.Substring(prevPos, curPos - prevPos);

                    // if newlines or spaces follow the word,
                    // consume them as suffix
                    prevPos = curPos;
                    while (curPos < s.Length &&
                           (char.IsControl(s[curPos]) ||
                            char.IsWhiteSpace(s[curPos])))
                    {
                        curPos++;
                    }
                    suffix = s.Substring(prevPos, curPos - prevPos);

                    // the word may still carry a leading special
                    // character or trailing punctuation
                    processWord(words, prefix, word, suffix);
                    prefix = string.Empty;
                }
            }
            return words;
        }
Beispiel #5
0
 /// <summary>
 /// Overloaded. Copies this WordsCollection to another one,
 /// beginning with the item at index zero.
 /// </summary>
 /// <param name="col">
 /// WordsCollection to copy to
 /// </param>
 public void CopyTo(WordsCollection col)
 {
     // Delegate to the indexed overload, starting at the first item.
     const int firstIndex = 0;
     CopyTo(col, firstIndex);
 }