Beispiel #1
0
 /// <summary>
 /// Copy this WordsCollection to another one
 ///     starting at the specified index position
 /// </summary>
 /// <param name="col" type="WordsCollection">
 /// WordsCollection to be copied to; every copied element is
 ///     handed to its Add method
 /// </param>
 /// <param name="index" type="integer">
 /// Starting index to begin copy operations. Nothing is copied
 ///     when it is not less than the number of elements
 /// </param>
 public void CopyTo(WordsCollection col, int index)
 {
     // Snapshot the element count before copying. If col is this
     // very collection, each Add is expected to grow this.List, so
     // re-reading this.List.Count in the loop condition would let
     // the loop chase its own tail and never terminate.
     var count = this.List.Count;

     for (var i = index; i < count; i++)
     {
         col.Add(this[i]);
     }
 }
Beispiel #2
0
        /// <summary>
        /// Splits a raw token into one or two words and adds them
        ///     to the collection
        /// </summary>
        /// <param name="words">
        /// Collection that receives the resulting word(s)
        /// </param>
        /// <param name="prefix">
        /// Text that precedes the token; attached to the first word added
        /// </param>
        /// <param name="word">
        /// Token that may be a plain word, or may carry one leading
        ///     special character or one trailing punctuation mark
        /// </param>
        /// <param name="suffix">
        /// Text that follows the token; attached to the last word added
        /// </param>
        private static void ProcessWord(WordsCollection words, string prefix, string word, string suffix)
        {
            var lastIndex = word.Length - 1;

            // A single character cannot be split any further.
            if (lastIndex == 0)
            {
                words.Add(new Word(word, prefix, suffix));
                return;
            }

            // Leading special character (anything that is neither a
            // letter nor a digit): report it as a word of its own and
            // let the remainder carry the suffix.
            var first = word[0];
            if (!char.IsLetterOrDigit(first))
            {
                words.Add(new Word(first.ToString(), prefix, string.Empty));
                words.Add(new Word(word.Substring(1), string.Empty, suffix));
                return;
            }

            // Trailing punctuation: the body keeps the prefix and the
            // punctuation mark becomes a separate word with the suffix.
            var last = word[lastIndex];
            if (char.IsPunctuation(last))
            {
                words.Add(new Word(word.Substring(0, lastIndex), prefix, string.Empty));
                words.Add(new Word(last.ToString(), string.Empty, suffix));
                return;
            }

            // Nothing to split off; report the token unchanged.
            words.Add(new Word(word, prefix, suffix));
        }
Beispiel #3
0
        /// <summary>
        /// Initializes a new instance of the <see cref="Merger"/> class
        ///     from two pieces of text.
        /// </summary>
        /// <param name="original">
        /// The original text; parsed into a words collection.
        /// </param>
        /// <param name="modified">
        /// The modified text; parsed into a words collection.
        /// </param>
        public Merger(string original, string modified)
        {
            // Break both input strings down into words collections.
            this.original = HtmlTextParser.parse(original);
            this.modified = HtmlTextParser.parse(modified);

            // Both search vectors are created with the same argument:
            // the number of words in the two collections combined.
            var combinedCount = this.original.Count + this.modified.Count;

            // Holds the forward searching front-line of the previous
            // searching loop.
            this.fwdVector = new IntVector(combinedCount);

            // Holds the backward searching front-line of the previous
            // searching loop.
            this.bwdVector = new IntVector(combinedCount);
        }
Beispiel #4
0
        /// <summary>
        /// Static method that parses the passed-in string into
        ///     Words collection. Html tags, non-breaking-space
        ///     entities and white space are not reported as words of
        ///     their own; they are attached to a neighbouring word as
        ///     its prefix or suffix. When such text is left over at the
        ///     end of the input, it is reported as the prefix of an
        ///     empty word.
        /// </summary>
        /// <param name="s">
        /// String to be parsed; may contain html tags and entities
        /// </param>
        /// <returns>
        /// Words Collection the parsed words were added to, in the
        ///     order they were encountered
        /// </returns>
        public static WordsCollection parse(string s)
        {
            var curPos = 0;
            var prefix = string.Empty;
            var suffix = string.Empty;
            var words  = new WordsCollection();

            while (curPos < s.Length)
            {
                // eat the leading or tailing white spaces
                int prevPos = curPos;
                while (curPos < s.Length && (char.IsControl(s[curPos]) || char.IsWhiteSpace(s[curPos])))
                {
                    curPos++;
                }

                prefix += s.Substring(prevPos, curPos - prevPos);

                if (curPos == s.Length)
                {
                    // it is possible that there are
                    // something in the prefix
                    if (prefix != string.Empty)
                    {
                        // report a empty word with prefix.
                        words.Add(new Word(string.Empty, prefix, string.Empty));
                    }

                    break;
                }

                // we have 3 different cases here,
                // 1) if the string starts with '<', we assume
                // that it is a html tag which will be put
                // into prefix.
                // 2) starts with '&', we need to check if it is
                // "&nbsp;" or "&#xxx;". If it is the former,
                // we treat it as prefix and if it is latter,
                // we treat it as a word.
                // 3) a string that may be a real word or a set
                // of words separated by "&nbsp;" or may have
                // leading special character or tailing
                // punctuation.
                // Another possible case that is too complicated
                // or expensive to handle is that some special
                // characters are embeded inside the word with
                // no space separation
                if (s[curPos] == '<')
                {
                    // it is a html tag, consume it
                    // as prefix.
                    prevPos = curPos;

                    // The bound must be tested BEFORE the character is
                    // read, otherwise an unclosed tag indexes one past
                    // the end of the string.
                    while (curPos < s.Length && s[curPos] != '>')
                    {
                        curPos++;
                    }

                    if (curPos == s.Length)
                    {
                        // if we come to this point, it means
                        // the html tag is not closed. Anyway,
                        // we are not validating html, so just
                        // take everything up to the end of the
                        // input (there is no '>' to include) and
                        // report a empty word with prefix.
                        prefix += s.Substring(prevPos, curPos - prevPos);
                        words.Add(new Word(string.Empty, prefix, string.Empty));
                        break;
                    }

                    // curPos is pointing to '>'; the "+ 1" makes
                    // the closing bracket part of the prefix.
                    prefix += s.Substring(prevPos, curPos - prevPos + 1);

                    // move past the '>'.
                    curPos++;
                    if (curPos == s.Length)
                    {
                        // the html tag is closed but nothing more
                        // behind, so report a empty word with prefix.
                        words.Add(new Word(string.Empty, prefix, string.Empty));
                        break;
                    }

                    continue;
                }

                string word;
                if (s[curPos] == '&')
                {
                    prevPos = curPos;

                    // number of characters from the '&' (inclusive) to
                    // the end of the input. Comparing against this with
                    // ">=" also recognises an entity that is the very
                    // last thing in the string.
                    var remaining = s.Length - curPos;

                    // case for html whitespace
                    if (remaining >= 6 && s.Substring(prevPos, 6) == "&nbsp;")
                    {
                        prefix += "&nbsp;";
                        curPos += 6;

                        if (curPos == s.Length)
                        {
                            // nothing follows the entity; without this
                            // the loop would end and silently drop the
                            // collected prefix, so report a empty word
                            // with prefix.
                            words.Add(new Word(string.Empty, prefix, string.Empty));
                            break;
                        }

                        continue;
                    }

                    // case for special character like "&#123;" etc.
                    // The static Regex.IsMatch is used so that no Regex
                    // object has to be constructed for every '&' found.
                    if (remaining >= 6 && Regex.IsMatch(s.Substring(prevPos, 6), @"&#[0-9]{3};"))
                    {
                        words.Add(new Word(s.Substring(prevPos, 6), prefix, string.Empty));
                        prefix  = string.Empty;
                        curPos += 6;
                        continue;
                    }

                    // case for special character like "&#12;" etc
                    if (remaining >= 5 && Regex.IsMatch(s.Substring(prevPos, 5), @"&#[0-9]{2};"))
                    {
                        words.Add(new Word(s.Substring(prevPos, 5), prefix, string.Empty));
                        prefix  = string.Empty;
                        curPos += 5;
                        continue;
                    }

                    // can't think of anything else that is special,
                    // have to treat it as a '&' leaded word. Hope
                    // it is just single '&' for and in meaning.
                    prevPos = curPos;
                    while (curPos < s.Length && !char.IsControl(s[curPos]) && !char.IsWhiteSpace(s[curPos]) &&
                           s[curPos] != '<')
                    {
                        curPos++;
                    }

                    word = s.Substring(prevPos, curPos - prevPos);

                    // eat the following witespace as suffix
                    prevPos = curPos;
                    while (curPos < s.Length && (char.IsControl(s[curPos]) || char.IsWhiteSpace(s[curPos])))
                    {
                        curPos++;
                    }

                    // suffix is always empty at this point, so a plain
                    // assignment is equivalent to appending.
                    suffix = s.Substring(prevPos, curPos - prevPos);

                    words.Add(new Word(word, prefix, suffix));
                    prefix = string.Empty;
                    suffix = string.Empty;
                }
                else
                {
                    // eat the word; it ends at white space, at the
                    // start of a tag or at the start of an entity.
                    prevPos = curPos;
                    while (curPos < s.Length && !char.IsControl(s[curPos]) && !char.IsWhiteSpace(s[curPos]) &&
                           s[curPos] != '<' && s[curPos] != '&')
                    {
                        curPos++;
                    }

                    word = s.Substring(prevPos, curPos - prevPos);

                    // if there are newlines or spaces follow
                    // the word, consume it as suffix
                    prevPos = curPos;
                    while (curPos < s.Length && (char.IsControl(s[curPos]) || char.IsWhiteSpace(s[curPos])))
                    {
                        curPos++;
                    }

                    suffix = s.Substring(prevPos, curPos - prevPos);

                    // ProcessWord separates a leading special character
                    // or a trailing punctuation mark from the word.
                    ProcessWord(words, prefix, word, suffix);
                    prefix = string.Empty;
                    suffix = string.Empty;
                }
            }

            return words;
        }
Beispiel #5
0
 /// <summary>
 /// Overloaded. Copies this WordsCollection into another one,
 ///     beginning with the element at index zero
 /// </summary>
 /// <param name="col" type="WordsCollection">
 /// WordsCollection that receives the copied elements
 /// </param>
 public void CopyTo(WordsCollection col)
 {
     // Delegate to the indexed overload, starting at the first element.
     const int startIndex = 0;
     this.CopyTo(col, startIndex);
 }
        /// <summary>
        /// Splits a raw token into one or two words and adds them
        ///     to the collection
        /// </summary>
        /// <param name="words">
        /// Collection that receives the resulting word(s)
        /// </param>
        /// <param name="prefix">
        /// Text that precedes the token; attached to the first word added
        /// </param>
        /// <param name="word">
        /// Token that may be a plain word, or may carry one leading
        ///     special character or one trailing punctuation mark
        /// </param>
        /// <param name="suffix">
        /// Text that follows the token; attached to the last word added
        /// </param>
        private static void ProcessWord(WordsCollection words, string prefix, string word, string suffix)
        {
            var lastIndex = word.Length - 1;

            // A single character cannot be split any further.
            if (lastIndex == 0)
            {
                words.Add(new Word(word, prefix, suffix));
                return;
            }

            // Leading special character (anything that is neither a
            // letter nor a digit): report it as a word of its own and
            // let the remainder carry the suffix.
            var first = word[0];
            if (!char.IsLetterOrDigit(first))
            {
                words.Add(new Word(first.ToString(), prefix, string.Empty));
                words.Add(new Word(word.Substring(1), string.Empty, suffix));
                return;
            }

            // Trailing punctuation: the body keeps the prefix and the
            // punctuation mark becomes a separate word with the suffix.
            var last = word[lastIndex];
            if (char.IsPunctuation(last))
            {
                words.Add(new Word(word.Substring(0, lastIndex), prefix, string.Empty));
                words.Add(new Word(last.ToString(), string.Empty, suffix));
                return;
            }

            // Nothing to split off; report the token unchanged.
            words.Add(new Word(word, prefix, suffix));
        }
        /// <summary>
        /// Static method that parses the passed-in string into
        ///     Words collection. Html tags, non-breaking-space
        ///     entities and white space are not reported as words of
        ///     their own; they are attached to a neighbouring word as
        ///     its prefix or suffix. When such text is left over at the
        ///     end of the input, it is reported as the prefix of an
        ///     empty word.
        /// </summary>
        /// <param name="s">
        /// String to be parsed; may contain html tags and entities
        /// </param>
        /// <returns>
        /// Words Collection the parsed words were added to, in the
        ///     order they were encountered
        /// </returns>
        public static WordsCollection parse(string s)
        {
            var curPos = 0;
            var prefix = string.Empty;
            var suffix = string.Empty;
            var words = new WordsCollection();

            while (curPos < s.Length)
            {
                // eat the leading or tailing white spaces
                int prevPos = curPos;
                while (curPos < s.Length && (char.IsControl(s[curPos]) || char.IsWhiteSpace(s[curPos])))
                {
                    curPos++;
                }

                prefix += s.Substring(prevPos, curPos - prevPos);

                if (curPos == s.Length)
                {
                    // it is possible that there are
                    // something in the prefix
                    if (prefix != string.Empty)
                    {
                        // report a empty word with prefix.
                        words.Add(new Word(string.Empty, prefix, string.Empty));
                    }

                    break;
                }

                // we have 3 different cases here,
                // 1) if the string starts with '<', we assume
                // that it is a html tag which will be put
                // into prefix.
                // 2) starts with '&', we need to check if it is
                // "&nbsp;" or "&#xxx;". If it is the former,
                // we treat it as prefix and if it is latter,
                // we treat it as a word.
                // 3) a string that may be a real word or a set
                // of words separated by "&nbsp;" or may have
                // leading special character or tailing
                // punctuation.
                // Another possible case that is too complicated
                // or expensive to handle is that some special
                // characters are embeded inside the word with
                // no space separation
                if (s[curPos] == '<')
                {
                    // it is a html tag, consume it
                    // as prefix.
                    prevPos = curPos;

                    // The bound must be tested BEFORE the character is
                    // read, otherwise an unclosed tag indexes one past
                    // the end of the string.
                    while (curPos < s.Length && s[curPos] != '>')
                    {
                        curPos++;
                    }

                    if (curPos == s.Length)
                    {
                        // if we come to this point, it means
                        // the html tag is not closed. Anyway,
                        // we are not validating html, so just
                        // take everything up to the end of the
                        // input (there is no '>' to include) and
                        // report a empty word with prefix.
                        prefix += s.Substring(prevPos, curPos - prevPos);
                        words.Add(new Word(string.Empty, prefix, string.Empty));
                        break;
                    }

                    // curPos is pointing to '>'; the "+ 1" makes
                    // the closing bracket part of the prefix.
                    prefix += s.Substring(prevPos, curPos - prevPos + 1);

                    // move past the '>'.
                    curPos++;
                    if (curPos == s.Length)
                    {
                        // the html tag is closed but nothing more
                        // behind, so report a empty word with prefix.
                        words.Add(new Word(string.Empty, prefix, string.Empty));
                        break;
                    }

                    continue;
                }

                string word;
                if (s[curPos] == '&')
                {
                    prevPos = curPos;

                    // number of characters from the '&' (inclusive) to
                    // the end of the input. Comparing against this with
                    // ">=" also recognises an entity that is the very
                    // last thing in the string.
                    var remaining = s.Length - curPos;

                    // case for html whitespace
                    if (remaining >= 6 && s.Substring(prevPos, 6) == "&nbsp;")
                    {
                        prefix += "&nbsp;";
                        curPos += 6;

                        if (curPos == s.Length)
                        {
                            // nothing follows the entity; without this
                            // the loop would end and silently drop the
                            // collected prefix, so report a empty word
                            // with prefix.
                            words.Add(new Word(string.Empty, prefix, string.Empty));
                            break;
                        }

                        continue;
                    }

                    // case for special character like "&#123;" etc.
                    // The static Regex.IsMatch is used so that no Regex
                    // object has to be constructed for every '&' found.
                    if (remaining >= 6 && Regex.IsMatch(s.Substring(prevPos, 6), @"&#[0-9]{3};"))
                    {
                        words.Add(new Word(s.Substring(prevPos, 6), prefix, string.Empty));
                        prefix = string.Empty;
                        curPos += 6;
                        continue;
                    }

                    // case for special character like "&#12;" etc
                    if (remaining >= 5 && Regex.IsMatch(s.Substring(prevPos, 5), @"&#[0-9]{2};"))
                    {
                        words.Add(new Word(s.Substring(prevPos, 5), prefix, string.Empty));
                        prefix = string.Empty;
                        curPos += 5;
                        continue;
                    }

                    // can't think of anything else that is special,
                    // have to treat it as a '&' leaded word. Hope
                    // it is just single '&' for and in meaning.
                    prevPos = curPos;
                    while (curPos < s.Length && !char.IsControl(s[curPos]) && !char.IsWhiteSpace(s[curPos]) &&
                           s[curPos] != '<')
                    {
                        curPos++;
                    }

                    word = s.Substring(prevPos, curPos - prevPos);

                    // eat the following witespace as suffix
                    prevPos = curPos;
                    while (curPos < s.Length && (char.IsControl(s[curPos]) || char.IsWhiteSpace(s[curPos])))
                    {
                        curPos++;
                    }

                    // suffix is always empty at this point, so a plain
                    // assignment is equivalent to appending.
                    suffix = s.Substring(prevPos, curPos - prevPos);

                    words.Add(new Word(word, prefix, suffix));
                    prefix = string.Empty;
                    suffix = string.Empty;
                }
                else
                {
                    // eat the word; it ends at white space, at the
                    // start of a tag or at the start of an entity.
                    prevPos = curPos;
                    while (curPos < s.Length && !char.IsControl(s[curPos]) && !char.IsWhiteSpace(s[curPos]) &&
                           s[curPos] != '<' && s[curPos] != '&')
                    {
                        curPos++;
                    }

                    word = s.Substring(prevPos, curPos - prevPos);

                    // if there are newlines or spaces follow
                    // the word, consume it as suffix
                    prevPos = curPos;
                    while (curPos < s.Length && (char.IsControl(s[curPos]) || char.IsWhiteSpace(s[curPos])))
                    {
                        curPos++;
                    }

                    suffix = s.Substring(prevPos, curPos - prevPos);

                    // ProcessWord separates a leading special character
                    // or a trailing punctuation mark from the word.
                    ProcessWord(words, prefix, word, suffix);
                    prefix = string.Empty;
                    suffix = string.Empty;
                }
            }

            return words;
        }
        /// <summary>
        /// Initializes a new instance of the <see cref="Merger"/> class
        ///     from two pieces of text.
        /// </summary>
        /// <param name="original">
        /// The original text; parsed into a words collection.
        /// </param>
        /// <param name="modified">
        /// The modified text; parsed into a words collection.
        /// </param>
        public Merger(string original, string modified)
        {
            // Break both input strings down into words collections.
            this.original = HtmlTextParser.parse(original);
            this.modified = HtmlTextParser.parse(modified);

            // Both search vectors are created with the same argument:
            // the number of words in the two collections combined.
            var combinedCount = this.original.Count + this.modified.Count;

            // Holds the forward searching front-line of the previous
            // searching loop.
            this.fwdVector = new IntVector(combinedCount);

            // Holds the backward searching front-line of the previous
            // searching loop.
            this.bwdVector = new IntVector(combinedCount);
        }
 /// <summary>
 /// Overloaded. Copies this WordsCollection into another one,
 ///     beginning with the element at index zero
 /// </summary>
 /// <param name="col" type="WordsCollection">
 /// WordsCollection that receives the copied elements
 /// </param>
 public void CopyTo(WordsCollection col)
 {
     // Delegate to the indexed overload, starting at the first element.
     const int startIndex = 0;
     this.CopyTo(col, startIndex);
 }
Beispiel #10
0
 /// <summary>
 /// Copy this WordsCollection to another one
 ///     starting at the specified index position
 /// </summary>
 /// <param name="col" type="WordsCollection">
 /// WordsCollection to be copied to; every copied element is
 ///     handed to its Add method
 /// </param>
 /// <param name="index" type="integer">
 /// Starting index to begin copy operations. Nothing is copied
 ///     when it is not less than the number of elements
 /// </param>
 public void CopyTo(WordsCollection col, int index)
 {
     // Snapshot the element count before copying. If col is this
     // very collection, each Add is expected to grow this.List, so
     // re-reading this.List.Count in the loop condition would let
     // the loop chase its own tail and never terminate.
     var count = this.List.Count;

     for (var i = index; i < count; i++)
     {
         col.Add(this[i]);
     }
 }