// Populates this object from XML: reads the "FullText" attribute of the
// current element, then deserializes every <SnippetLine> child element
// into the snippets list (created lazily on the first child).
//
// NOTE(review): this has the shape of IXmlSerializable.ReadXml. Under that
// contract the method must consume the whole wrapper element, including
// the case where the element is empty -- confirm the enclosing type
// actually implements IXmlSerializable.
public void ReadXml(XmlReader reader)
{
	FullText = Convert.ToBoolean(reader.GetAttribute("FullText"));
	reader.MoveToContent();

	if (reader.IsEmptyElement) // no <snippetline>...</snippetline>
	{
		// Step past the empty element so the reader is left positioned
		// after it; returning without reading would leave a containing
		// deserializer stuck on this element.
		reader.Read();
		return;
	}

	// Enter the element and skip to its first child.
	reader.Read();
	reader.MoveToContent(); // Keep doing this, to skip over the whitespaces

	while (reader.Name == "SnippetLine" && reader.NodeType == XmlNodeType.Element)
	{
		// ReadOuterXml consumes the whole <SnippetLine> element and
		// advances the reader past it.
		string snippet_line_string = reader.ReadOuterXml(); // could be really huge for full text
		reader.MoveToContent();

		SnippetLine snippet_line;
		using (StringReader snippet_line_reader = new StringReader(snippet_line_string))
		{
			snippet_line = (SnippetLine)snippet_line_ser.Deserialize(snippet_line_reader);
		}

		if (snippets == null)
		{
			snippets = new ArrayList();
		}
		snippets.Add(snippet_line);
	}

	// Consume the closing tag of the wrapper element.
	reader.ReadEndElement();
}
// Builds the list of snippet lines for a snippet request.
//
// The queryable for the hit's source is looked up through QueryDriver. If
// none is found, a message is written to the console, a SnippetReader
// constructed with null/default arguments is used instead, and full-text
// mode is switched off. If the queryable returns no reader, an empty list
// is returned.
//
// In normal mode the lines produced by the reader's GetSnippet () are
// copied into the result. In full-text mode the reader is drained line by
// line, each line is passed through StringFu.CleanupInvalidXmlCharacters,
// and the whole text is returned as a single SnippetLine (Line = 1) that
// holds one Fragment with QueryTermIndex = -1.
//
// The reader is closed in a finally block so it is released even when
// reading from it throws.
private static List<SnippetLine> GetSnippet (SnippetRequest request)
{
	Queryable queryable = QueryDriver.GetQueryable (request.Hit.Source);
	ISnippetReader snippet_reader;
	bool full_text = request.FullText;
	int ctx_length = request.ContextLength;
	int snp_length = request.SnippetLength;

	if (queryable == null) {
		Console.WriteLine ("SnippetExecutor: No queryable object matches '{0}'", request.Hit.Source);
		snippet_reader = new SnippetReader (null, null, false, -1, -1);
		full_text = false;
	} else {
		snippet_reader = queryable.GetSnippet (request.QueryTerms, request.Hit, full_text, ctx_length, snp_length);
	}

	List<SnippetLine> snippetlines = new List<SnippetLine> ();

	if (snippet_reader == null)
		return snippetlines;

	try {
		if (! full_text) {
			foreach (SnippetLine snippet_line in snippet_reader.GetSnippet ())
				snippetlines.Add (snippet_line);
		} else {
			SnippetLine snippet_line = new SnippetLine ();
			snippet_line.Line = 1;

			Fragment fragment = new Fragment ();
			fragment.QueryTermIndex = -1;

			StringBuilder sb = new StringBuilder ();
			string line;

			// Read data from snippet_reader and write
			while ((line = snippet_reader.ReadLine ()) != null) {
				sb.Append (StringFu.CleanupInvalidXmlCharacters (line));
				sb.Append ("\n");
			}

			fragment.Text = sb.ToString ();
			snippet_line.Fragments = new ArrayList ();
			snippet_line.Fragments.Add (fragment);
			snippetlines.Add (snippet_line);
		}
	} finally {
		// Always release the reader, including on the error path.
		snippet_reader.Close ();
	}

	return snippetlines;
}
// Scans text from character index pos, looking for tokens whose stem
// equals one of stemmed_terms. Returns one snippet line of the form
//     (words)* [ (matched word) (words)* ]+
// or null when no match is found before the end of text.
//
// pos is advanced as tokens are consumed, so the caller can call again to
// continue from where this call stopped. The shared sliding_window holds
// the start positions of the most recent non-matching tokens; it supplies
// the leading context of a match and counts the trailing context after it.
// Scanning stops early once context_length non-matching tokens have been
// seen after a match.
private SnippetLine MarkTerms (ArrayList stemmed_terms, string text, ref int pos)
{
	SnippetLine line = null;

	// One past the end of the previously matched token; initially the
	// position where scanning started.
	int last_match_end = pos;

	while (true) {
		// Skip separators to reach the start of the next token.
		while (pos < text.Length && IsTokenSeparator (text [pos]))
			pos++;

		if (pos >= text.Length)
			break;

		// Locate the end of the token.
		int token_end;
		for (token_end = pos + 1; token_end < text.Length; token_end++) {
			if (IsTokenSeparator (text [token_end]))
				break;
		}

		string word = text.Substring (pos, token_end - pos);

		// The stem is computed lazily and reused across all terms.
		string stemmed_word = null;
		int match_index = -1;

		for (int i = 0; i < stemmed_terms.Count; i++) {
			string term = (string) stemmed_terms [i];

			// A term longer than the raw token is not considered.
			if (term.Length > word.Length)
				continue;

			if (stemmed_word == null)
				stemmed_word = LuceneCommon.Stem (word.ToLower ());

			if (String.Compare (term, stemmed_word, true) == 0) {
				match_index = i;
				break;
			}
		}

		if (match_index >= 0) {
			if (line == null)
				line = new SnippetLine ();

			// The leading fragment starts at the oldest position held in
			// the window or, when the window reports -1 (no non-matching
			// words since the last match), where the previous word ended.
			int window_start = sliding_window.StartValue;
			int fragment_start = (window_start == -1) ? last_match_end : window_start;
			sliding_window.Reset ();

			line.AddNonMatchFragment (text.Substring (fragment_start, pos - fragment_start));
			line.AddMatchFragment (match_index, word);

			last_match_end = token_end;
		} else {
			// Record the start of this non-matching token.
			sliding_window.Add (pos);

			// Once a match has been emitted and enough trailing words
			// have been seen, close the snippet line here.
			bool context_full = line != null
				&& line.Count > 0
				&& sliding_window.Count == context_length;

			if (context_full) {
				sliding_window.Reset ();
				line.AddNonMatchFragment (text.Substring (last_match_end, token_end - last_match_end));
				return line;
			}
		}

		pos = token_end;
	}

	// End of text reached: if fewer than context_length words followed the
	// last match, whatever remains becomes the trailing fragment.
	bool has_fragments = line != null && line.Count > 0;
	sliding_window.Reset ();

	if (! has_fragments)
		return null;

	line.AddNonMatchFragment (text.Substring (last_match_end, pos - last_match_end));
	return line;
}