Exemple #1
0
        // Scans `text` starting at character index `pos`, looking for tokens whose
        // stem equals one of the entries of `stemmed_terms`, and builds one snippet
        // line of the form (words)*[(matched word)(words)*]+.
        //
        // stemmed_terms: list of already-stemmed query terms (each element is cast to string).
        // text:          the text to scan.
        // pos:           in: index at which scanning starts; out: index at which the
        //                next call should resume.
        //
        // Returns the snippet line that was built, or null when no token from `pos`
        // to the end of `text` matches any term. The shared sliding_window is reset
        // before every return.
        private SnippetLine MarkTerms(ArrayList stemmed_terms, string text, ref int pos)
        {
            SnippetLine snippet_line       = null;
            int         prev_match_end_pos = pos;     // misnomer; means 1 + end_pos of previous word

            while (pos < text.Length)
            {
                // Skip separators until the beginning of the next token
                if (IsTokenSeparator(text [pos]))
                {
                    ++pos;
                    continue;
                }

                // Find the end of the token (exclusive)
                int end_pos = pos + 1;
                while (end_pos < text.Length && !IsTokenSeparator(text [end_pos]))
                {
                    ++end_pos;
                }

                string token       = text.Substring(pos, end_pos - pos);
                int    match_index = FindMatchingTermIndex(stemmed_terms, token);

                if (match_index >= 0)
                {
                    // We have a match!
                    if (snippet_line == null)
                    {
                        snippet_line = new SnippetLine();
                    }

                    // The fragment before the match starts where the sliding window starts
                    int start_pos = sliding_window.StartValue;
                    if (start_pos == -1)                     // If no non-match words seen after last match
                    {
                        start_pos = prev_match_end_pos;      // Use wherever previous word ended
                    }
                    sliding_window.Reset();

                    snippet_line.AddNonMatchFragment(text.Substring(start_pos, pos - start_pos));
                    snippet_line.AddMatchFragment(match_index, token);
                    prev_match_end_pos = end_pos;
                }
                else
                {
                    // Remember the start pos of this non-matching token in the window
                    sliding_window.Add(pos);

                    // If we found a match previously and saw enough following words, stop
                    if (snippet_line != null && snippet_line.Count > 0 && sliding_window.Count == context_length)
                    {
                        sliding_window.Reset();
                        snippet_line.AddNonMatchFragment(text.Substring(prev_match_end_pos, end_pos - prev_match_end_pos));

                        // NOTE(review): pos is deliberately left untouched here (it still
                        // points at the start of the last token included above), so the
                        // next call rescans that token. Confirm callers rely on this
                        // before changing it.
                        return(snippet_line);
                    }
                }

                pos = end_pos;
            }

            // Text ended before context_length words followed the last match:
            // add whatever came after the last match.
            if (snippet_line != null && snippet_line.Count > 0)
            {
                sliding_window.Reset();
                snippet_line.AddNonMatchFragment(text.Substring(prev_match_end_pos, pos - prev_match_end_pos));
                return(snippet_line);
            }

            sliding_window.Reset();
            return(null);
        }

        // Returns the index of the first entry of stemmed_terms that equals the stem
        // of `token` (ignoring case), or -1 when no entry matches.
        private int FindMatchingTermIndex(ArrayList stemmed_terms, string token)
        {
            // Stemmed lazily and cached, so the token is stemmed at most once and
            // not at all when every term is longer than the token.
            string stemmed_token = null;

            for (int i = 0; i < stemmed_terms.Count; i++)
            {
                string term = (string)stemmed_terms [i];

                // If this term is longer than the token in question, it cannot match.
                if (token.Length < term.Length)
                {
                    continue;
                }

                if (stemmed_token == null)
                {
                    // Invariant lowercasing: the stemmer input must not depend on the
                    // current thread culture (e.g. Turkish dotless i).
                    stemmed_token = LuceneCommon.Stem(token.ToLowerInvariant());
                }

                // Ordinal, case-insensitive comparison for the same reason.
                if (String.Equals(term, stemmed_token, StringComparison.OrdinalIgnoreCase))
                {
                    return(i);
                }
            }

            return(-1);
        }