// Scans text, beginning at character index pos, for tokens whose stem equals
// one of the entries of stemmed_terms, and collects the matches together with
// the text around them into a SnippetLine of the shape
//     (words)*[(matched word)(words)*]+
//
// stemmed_terms  the stemmed search terms; every element is cast to string.
// text           the text being scanned.
// pos            in:  index at which scanning starts;
//                out: index at which scanning stopped.
//
// Returns the assembled line. Returns null when the end of text is reached
// and no line with a Count greater than zero has been built.
//
// Scanning stops early as soon as context_length non-matching words have
// been recorded in sliding_window after a match. On that early return pos is
// left at the START of the last of those words (it is not advanced past it),
// although that word is part of the returned trailing fragment.
// sliding_window is reset on every return path.
private SnippetLine MarkTerms(ArrayList stemmed_terms, string text, ref int pos)
{
    SnippetLine line = null;

    // Index just past the most recent match; until a match is seen this is
    // simply the position at which scanning started.
    int after_last_match = pos;

    while (pos < text.Length)
    {
        // Step over separators until a token starts.
        if (IsTokenSeparator(text[pos]))
        {
            ++pos;
            continue;
        }

        // The token runs up to (not including) the next separator or the
        // end of the text.
        int token_end = pos + 1;
        while (token_end < text.Length && !IsTokenSeparator(text[token_end]))
        {
            ++token_end;
        }

        int token_length = token_end - pos;
        string token = text.Substring(pos, token_length);

        // Look for the first term whose stem equals the stem of this token.
        // The token is stemmed lazily, and only once, the first time a term
        // short enough to be a candidate is encountered.
        int matched_term = -1;
        string token_stem = null;
        for (int i = 0; i < stemmed_terms.Count; i++)
        {
            string term = (string)stemmed_terms[i];

            // A term longer than the raw token is skipped without stemming.
            if (term.Length > token_length)
            {
                continue;
            }

            if (token_stem == null)
            {
                token_stem = LuceneCommon.Stem(token.ToLower());
            }

            // Case-insensitive comparison of term and stemmed token.
            if (String.Compare(term, token_stem, true) == 0)
            {
                matched_term = i;
                break;
            }
        }

        if (matched_term >= 0)
        {
            if (line == null)
            {
                line = new SnippetLine();
            }

            // The leading context starts at the window's StartValue. A value
            // of -1 is treated as "no non-matching word recorded since the
            // window was last reset", in which case the context starts right
            // after the previous match (or at the initial scan position).
            int context_start = sliding_window.StartValue;
            if (context_start == -1)
            {
                context_start = after_last_match;
            }
            sliding_window.Reset();

            line.AddNonMatchFragment(text.Substring(context_start, pos - context_start));
            line.AddMatchFragment(matched_term, token);
            after_last_match = token_end;
        }
        else
        {
            // Record where this non-matching token starts.
            sliding_window.Add(pos);

            // Enough trailing words after an earlier match: emit everything
            // from the end of that match through this token and stop.
            // pos is intentionally left untouched here (see header comment).
            if (line != null && line.Count > 0 && sliding_window.Count == context_length)
            {
                sliding_window.Reset();
                line.AddNonMatchFragment(text.Substring(after_last_match, token_end - after_last_match));
                return line;
            }
        }

        pos = token_end;
    }

    // End of text reached. The window is reset whether or not anything was
    // found; the check on the line is evaluated first, as before.
    bool has_content = line != null && line.Count > 0;
    sliding_window.Reset();

    if (!has_content)
    {
        return null;
    }

    // Fewer than context_length words followed the last match, so whatever
    // remains up to the stopping position becomes the trailing fragment.
    line.AddNonMatchFragment(text.Substring(after_last_match, pos - after_last_match));
    return line;
}