Пример #1
0
        /// <summary>
        /// Splits a raw token into at most two words and appends them to
        /// <paramref name="words"/>: a leading non-alphanumeric character or
        /// a trailing punctuation character is reported as a word of its own.
        /// </summary>
        /// <param name="words">
        /// Collection that receives the resulting word(s)
        /// </param>
        /// <param name="prefix">
        /// Text that precedes the token; it is attached to the first word
        /// that gets added
        /// </param>
        /// <param name="word">
        /// The raw token. It may be a real word, or carry a leading special
        /// character such as '(' or '"', or a trailing punctuation character.
        /// It must contain at least one character.
        /// </param>
        /// <param name="suffix">
        /// Text that follows the token; it is attached to the last word
        /// that gets added
        /// </param>
        private static void ProcessWord(WordsCollection words,
                                        string prefix, string word, string suffix)
        {
            var lastIndex = word.Length - 1;

            // A single character cannot be split any further.
            if (lastIndex == 0)
            {
                words.Add(new Word(word, prefix, suffix));
                return;
            }

            // Leading special character: report it on its own, then the
            // remainder of the token as a second word. The remainder is not
            // inspected again, so a trailing punctuation stays attached.
            var head = word[0];
            if (!char.IsLetterOrDigit(head))
            {
                words.Add(new Word(head.ToString(), prefix, ""));
                words.Add(new Word(word.Substring(1), "", suffix));
                return;
            }

            // Trailing punctuation: the body first, then the punctuation
            // mark as a separate word.
            var tail = word[lastIndex];
            if (char.IsPunctuation(tail))
            {
                words.Add(new Word(word.Substring(0, lastIndex), prefix, ""));
                words.Add(new Word(tail.ToString(), "", suffix));
                return;
            }

            // Nothing to strip; treat the token as an ordinary word.
            words.Add(new Word(word, prefix, suffix));
        }
Пример #2
0
 /// <summary>
 /// Appends the words of this collection, from position
 /// <paramref name="index"/> up to the last one, to another
 /// WordsCollection.
 /// </summary>
 /// <param name="col">
 /// WordsCollection that receives the words
 /// </param>
 /// <param name="index">
 /// Zero-based position of the first word to copy. Nothing is copied
 /// when it is greater than or equal to the number of words.
 /// </param>
 /// <remarks>
 /// The number of words is read once before copying starts. If
 /// <paramref name="col"/> is this very collection, each Add would
 /// otherwise grow the loop bound and the copy would never finish.
 /// </remarks>
 public void CopyTo(WordsCollection col, int index)
 {
     // Snapshot the bound so that additions made while copying
     // (col may be this instance) cannot extend the loop.
     var count = List.Count;

     for (var i = index; i < count; i++)
     {
         col.Add(this[i]);
     }
 }
Пример #3
0
        /// <summary>
        /// Creates a merge processor for two versions of a text.
        /// </summary>
        /// <param name="original">
        /// Text of the original version
        /// </param>
        /// <param name="modified">
        /// Text of the modified version
        /// </param>
        public MergeProcessor(string original, string modified)
        {
            // Break both inputs up into word collections.
            _original = HtmlTextParser.Parse(original);
            _modified = HtmlTextParser.Parse(modified);

            // Both search vectors are sized by the combined word count
            // of the two collections.
            var combinedCount = _original.Count + _modified.Count;

            // Holds the forward-search front line of the previous
            // search loop.
            _fwdVector = new IntVector(combinedCount);

            // Holds the backward-search front line of the previous
            // search loop.
            _bwdVector = new IntVector(combinedCount);
        }
Пример #4
0
        // Matchers for numeric character references with three digits
        // ("&#123;") and two digits ("&#12;"). They are built once and
        // shared instead of being re-created for every '&' in the input.
        private static readonly Regex ThreeDigitEntity = new Regex(@"&#[0-9]{3};");
        private static readonly Regex TwoDigitEntity   = new Regex(@"&#[0-9]{2};");

        /// <summary>
        /// Static method that parses the passed-in string into a
        /// Words collection.
        /// </summary>
        /// <remarks>
        /// Each word carries the text that precedes it (white space,
        /// html tags, "&amp;nbsp;") as prefix and the white space that
        /// follows it as suffix. Prefix text that is not followed by
        /// any word, for example trailing white space or a tag at the end
        /// of the input, is reported as an empty word with that prefix.
        /// An html tag that is never closed is consumed up to the end of
        /// the input and treated the same way.
        /// </remarks>
        /// <param name="s">
        /// String to parse. A null or empty string yields an empty
        /// collection.
        /// </param>
        /// <returns>
        /// Words Collection
        /// </returns>
        public static WordsCollection Parse(string s)
        {
            var words = new WordsCollection();
            if (string.IsNullOrEmpty(s))
            {
                return words;
            }

            var curPos = 0;
            var prefix = string.Empty;

            while (curPos < s.Length)
            {
                // eat the leading white space / control characters
                var prevPos = curPos;
                curPos = SkipBlanks(s, curPos);
                prefix += s.Substring(prevPos, curPos - prevPos);

                if (curPos == s.Length)
                {
                    // nothing but prefix is left; it is reported
                    // after the loop
                    break;
                }

                // We have 3 different cases here:
                // 1) the text starts with '<': we assume it is an html
                //    tag, which goes into the prefix.
                // 2) the text starts with '&': "&nbsp;" goes into the
                //    prefix, "&#xx;" / "&#xxx;" is a word of its own, and
                //    anything else is treated as a '&'-led word.
                // 3) anything else is a token that may be a real word or
                //    may have a leading special character or trailing
                //    punctuation.
                //
                // Special characters embedded inside a word with no space
                // separation are not handled.
                if (s[curPos] == '<')
                {
                    // Consume the tag as prefix. The bounds check has to
                    // come before the character test, otherwise an
                    // unclosed tag indexes past the end of the string.
                    prevPos = curPos;
                    while (curPos < s.Length && s[curPos] != '>')
                    {
                        curPos++;
                    }

                    if (curPos < s.Length)
                    {
                        // curPos points at '>'; include it in the tag
                        curPos++;
                    }

                    // For an unclosed tag this takes everything up to the
                    // end of the input; we are not validating html.
                    prefix += s.Substring(prevPos, curPos - prevPos);
                    continue;
                }

                if (s[curPos] == '&')
                {
                    var remaining = s.Length - curPos;

                    // html white space: treated as prefix, also when it
                    // is the very last thing in the input
                    if (remaining >= 6 && s.Substring(curPos, 6) == "&nbsp;")
                    {
                        prefix += "&nbsp;";
                        curPos += 6;
                        continue;
                    }

                    // special characters like "&#123;" or "&#12;" are
                    // reported as words of their own
                    var entityLength = 0;
                    if (remaining >= 6 &&
                        ThreeDigitEntity.IsMatch(s.Substring(curPos, 6)))
                    {
                        entityLength = 6;
                    }
                    else if (remaining >= 5 &&
                             TwoDigitEntity.IsMatch(s.Substring(curPos, 5)))
                    {
                        entityLength = 5;
                    }

                    if (entityLength > 0)
                    {
                        words.Add(new Word(s.Substring(curPos, entityLength), prefix, ""));
                        prefix  = string.Empty;
                        curPos += entityLength;
                        continue;
                    }

                    // Nothing special recognised: treat it as a '&'-led
                    // word that runs up to the next white space or tag.
                    // Note that a further '&' does not end this word.
                    prevPos = curPos;
                    while (curPos < s.Length &&
                           !IsBlank(s[curPos]) &&
                           s[curPos] != '<')
                    {
                        curPos++;
                    }
                    var ampWord = s.Substring(prevPos, curPos - prevPos);

                    // eat the following white space as suffix
                    prevPos = curPos;
                    curPos  = SkipBlanks(s, curPos);
                    var ampSuffix = s.Substring(prevPos, curPos - prevPos);

                    // added as is, without leading/trailing character
                    // splitting
                    words.Add(new Word(ampWord, prefix, ampSuffix));
                    prefix = string.Empty;
                    continue;
                }

                // eat the word: it ends at white space, a tag or a '&'
                prevPos = curPos;
                while (curPos < s.Length &&
                       !IsBlank(s[curPos]) &&
                       s[curPos] != '<' &&
                       s[curPos] != '&')
                {
                    curPos++;
                }
                var word = s.Substring(prevPos, curPos - prevPos);

                // if newlines or spaces follow the word, consume them
                // as suffix
                prevPos = curPos;
                curPos  = SkipBlanks(s, curPos);
                var suffix = s.Substring(prevPos, curPos - prevPos);

                ProcessWord(words, prefix, word, suffix);
                prefix = string.Empty;
            }

            // Prefix text that no word followed (trailing white space,
            // a tag or "&nbsp;" at the end of the input) is reported as
            // an empty word so that it is not lost.
            if (prefix.Length > 0)
            {
                words.Add(new Word("", prefix, ""));
            }

            return words;
        }

        // True for the characters Parse treats as separators between
        // words: control characters and white space.
        private static bool IsBlank(char c)
        {
            return char.IsControl(c) || char.IsWhiteSpace(c);
        }

        // Returns the index of the first character at or after pos that
        // is not a separator, or s.Length when only separators remain.
        private static int SkipBlanks(string s, int pos)
        {
            while (pos < s.Length && IsBlank(s[pos]))
            {
                pos++;
            }
            return pos;
        }
Пример #5
0
 /// <summary>
 /// Overloaded. Copies every word of this WordsCollection,
 /// beginning with the first one, to another WordsCollection.
 /// </summary>
 /// <param name="col">
 /// WordsCollection that receives the words
 /// </param>
 public void CopyTo(WordsCollection col)
 {
     // Delegate to the indexed overload, starting at the first word.
     const int firstIndex = 0;
     CopyTo(col, firstIndex);
 }