/// <summary>
/// Splits a raw token into one or two <c>Word</c> entries and appends
/// them to <paramref name="words"/>.
/// </summary>
/// <remarks>
/// Only one split is performed per call: either the first character is
/// detached (when it is not a letter or digit) or the last character is
/// detached (when it is punctuation). The remainder is added as-is and is
/// not examined again.
/// </remarks>
/// <param name="words">
/// Collection that receives the resulting word(s).
/// </param>
/// <param name="prefix">
/// Text that precedes the token; attached to the first entry added.
/// </param>
/// <param name="word">
/// The token itself. It may be a plain word, or carry a leading special
/// character or a trailing punctuation mark.
/// </param>
/// <param name="suffix">
/// Text that follows the token; attached to the last entry added.
/// </param>
private static void ProcessWord(WordsCollection words, string prefix, string word, string suffix)
{
    var lastIndex = word.Length - 1;

    // A single character cannot be split any further.
    if (lastIndex == 0)
    {
        words.Add(new Word(word, prefix, suffix));
        return;
    }

    // Leading special character such as '(' or '"': report it on its own,
    // followed by the rest of the token.
    var head = word[0];
    if (!char.IsLetterOrDigit(head))
    {
        words.Add(new Word(head.ToString(), prefix, ""));
        words.Add(new Word(word.Substring(1), "", suffix));
        return;
    }

    // Trailing punctuation: report the body first, then the punctuation.
    var tail = word[lastIndex];
    if (char.IsPunctuation(tail))
    {
        words.Add(new Word(word.Substring(0, lastIndex), prefix, ""));
        words.Add(new Word(tail.ToString(), "", suffix));
        return;
    }

    // Nothing to detach; treat the token as an ordinary word.
    words.Add(new Word(word, prefix, suffix));
}
/// <summary>
/// Appends the items of this collection, from position
/// <paramref name="index"/> through the last item, to another
/// WordsCollection.
/// </summary>
/// <param name="col" type="WordsCollection">
/// WordsCollection that receives the items
/// </param>
/// <param name="index" type="integer">
/// Position of the first item to copy
/// </param>
public void CopyTo(WordsCollection col, int index)
{
    // NOTE(review): List.Count is re-read on every pass; if col is this
    // same collection and Add grows List, the loop would not terminate.
    // Confirm callers never copy a collection into itself.
    var position = index;
    while (position < List.Count)
    {
        col.Add(this[position]);
        position++;
    }
}
/// <summary>
/// Creates a merge processor for two pieces of text.
/// </summary>
/// <param name="original">
/// The original text; parsed with <c>HtmlTextParser.Parse</c>.
/// </param>
/// <param name="modified">
/// The modified text; parsed with <c>HtmlTextParser.Parse</c>.
/// </param>
public MergeProcessor(string original, string modified)
{
    // Break both inputs down into word collections.
    _original = HtmlTextParser.Parse(original);
    _modified = HtmlTextParser.Parse(modified);

    // Both front-line vectors are created with the combined word count
    // of the two collections.
    var combinedCount = _original.Count + _modified.Count;

    // Holds the forward-search front line from the previous search loop.
    _fwdVector = new IntVector(combinedCount);

    // Holds the backward-search front line from the previous search loop.
    _bwdVector = new IntVector(combinedCount);
}
/// <summary>
/// Parses the passed-in string into a words collection.
/// </summary>
/// <remarks>
/// The input is scanned left to right and split into <c>Word</c> entries,
/// each carrying the text that precedes it (prefix) and the whitespace
/// that follows it (suffix):
/// <list type="bullet">
/// <item>Whitespace and control characters in front of a word go into its prefix.</item>
/// <item>Anything from '&lt;' up to and including the next '&gt;' is treated
/// as an HTML tag and goes into the prefix. The HTML is not validated; an
/// unclosed tag is placed in the prefix of an empty word.</item>
/// <item>"&amp;nbsp;" goes into the prefix; a decimal character reference
/// with two or three digits (such as "&amp;#123;") becomes a word of its
/// own; any other '&amp;' starts an ordinary word.</item>
/// <item>Everything else is read up to the next whitespace, control
/// character, '&lt;' or '&amp;' and handed to <c>ProcessWord</c>.</item>
/// </list>
/// Special characters embedded inside a word without surrounding spaces
/// are not separated.
/// </remarks>
/// <param name="s">
/// String to parse
/// </param>
/// <returns>
/// Words collection built from <paramref name="s"/>
/// </returns>
public static WordsCollection Parse(string s)
{
    var curPos = 0;
    var prefix = string.Empty;
    var words = new WordsCollection();

    while (curPos < s.Length)
    {
        // Leading whitespace belongs to the prefix of the next word.
        var prevPos = curPos;
        curPos = SkipWhitespaceAndControl(s, curPos);
        prefix += s.Substring(prevPos, curPos - prevPos);

        if (curPos == s.Length)
        {
            // Nothing but whitespace was left; keep it by reporting an
            // empty word that carries the pending prefix.
            if (prefix != string.Empty)
            {
                words.Add(new Word("", prefix, ""));
            }

            break;
        }

        if (s[curPos] == '<')
        {
            // HTML tag: consume it as prefix.
            var tagEnd = s.IndexOf('>', curPos);
            if (tagEnd < 0)
            {
                // The tag is never closed. We are not validating HTML,
                // so take the remainder of the input as prefix and
                // report an empty word.
                prefix += s.Substring(curPos);
                words.Add(new Word("", prefix, ""));
                break;
            }

            prefix += s.Substring(curPos, tagEnd - curPos + 1);
            curPos = tagEnd + 1;

            if (curPos == s.Length)
            {
                // The tag is closed but nothing follows it, so report
                // an empty word with the prefix.
                words.Add(new Word("", prefix, ""));
                break;
            }

            continue;
        }

        if (s[curPos] == '&')
        {
            // The bounds checks below are strict on purpose: an entity
            // that ends exactly at the end of the input falls through to
            // the generic '&' word path, so its text is still reported
            // rather than left in a prefix that is never flushed.

            // HTML whitespace entity: part of the prefix.
            if (curPos + 6 < s.Length && s.Substring(curPos, 6) == "&nbsp;")
            {
                prefix += "&nbsp;";
                curPos += 6;
                continue;
            }

            // Three-digit character reference such as "&#123;".
            if (curPos + 6 < s.Length && IsDecimalCharacterReference(s, curPos, 3))
            {
                words.Add(new Word(s.Substring(curPos, 6), prefix, ""));
                prefix = string.Empty;
                curPos += 6;
                continue;
            }

            // Two-digit character reference such as "&#12;".
            if (curPos + 5 < s.Length && IsDecimalCharacterReference(s, curPos, 2))
            {
                words.Add(new Word(s.Substring(curPos, 5), prefix, ""));
                prefix = string.Empty;
                curPos += 5;
                continue;
            }

            // Nothing special: treat it as a word that starts with '&'
            // (hopefully a lone '&' meaning "and"). Unlike ordinary
            // words, a later '&' does not end this word.
            prevPos = curPos;
            while (curPos < s.Length && !IsWhitespaceOrControl(s[curPos]) && s[curPos] != '<')
            {
                curPos++;
            }

            var ampersandWord = s.Substring(prevPos, curPos - prevPos);

            // The whitespace that follows becomes the suffix.
            prevPos = curPos;
            curPos = SkipWhitespaceAndControl(s, curPos);
            words.Add(new Word(ampersandWord, prefix, s.Substring(prevPos, curPos - prevPos)));
            prefix = string.Empty;
            continue;
        }

        // Ordinary word: read up to whitespace, a tag or an entity.
        prevPos = curPos;
        while (curPos < s.Length
               && !IsWhitespaceOrControl(s[curPos])
               && s[curPos] != '<'
               && s[curPos] != '&')
        {
            curPos++;
        }

        var word = s.Substring(prevPos, curPos - prevPos);

        // Newlines or spaces that follow the word become its suffix.
        prevPos = curPos;
        curPos = SkipWhitespaceAndControl(s, curPos);

        // The word may still carry a leading special character or a
        // trailing punctuation mark; ProcessWord sorts that out.
        ProcessWord(words, prefix, word, s.Substring(prevPos, curPos - prevPos));
        prefix = string.Empty;
    }

    return words;
}

/// <summary>
/// Tells whether a character is a control or whitespace character.
/// </summary>
private static bool IsWhitespaceOrControl(char c)
{
    return char.IsControl(c) || char.IsWhiteSpace(c);
}

/// <summary>
/// Returns the position of the first character at or after
/// <paramref name="start"/> that is neither whitespace nor a control
/// character, or the length of <paramref name="s"/> if there is none.
/// </summary>
private static int SkipWhitespaceAndControl(string s, int start)
{
    var pos = start;
    while (pos < s.Length && IsWhitespaceOrControl(s[pos]))
    {
        pos++;
    }

    return pos;
}

/// <summary>
/// Tells whether <paramref name="s"/> contains, starting at
/// <paramref name="start"/>, the text "&amp;#" followed by exactly
/// <paramref name="digitCount"/> ASCII digits and a ';'.
/// </summary>
private static bool IsDecimalCharacterReference(string s, int start, int digitCount)
{
    var end = start + digitCount + 2; // position of the closing ';'
    if (end >= s.Length || s[start] != '&' || s[start + 1] != '#' || s[end] != ';')
    {
        return false;
    }

    for (var i = start + 2; i < end; i++)
    {
        // ASCII digits only, matching the [0-9] character class.
        if (s[i] < '0' || s[i] > '9')
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Overloaded. Appends every item of this collection, beginning with
/// the one at index zero, to another WordsCollection.
/// </summary>
/// <param name="col" type="WordsCollection">
/// WordsCollection that receives the items
/// </param>
public void CopyTo(WordsCollection col)
{
    const int firstIndex = 0;
    CopyTo(col, firstIndex);
}