Ejemplo n.º 1
0
        /// <summary>
        /// Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in TextFragment objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        public TextFragment[] GetBestTextFragments(
            TokenStream tokenStream,
            String text,
            bool mergeContiguousFragments,
            int maxNumFragments)
        {
            var docFrags = new List <TextFragment>();
            var newText  = new StringBuilder();

            var termAtt   = tokenStream.AddAttribute <ITermAttribute>();
            var offsetAtt = tokenStream.AddAttribute <IOffsetAttribute>();

            tokenStream.AddAttribute <IPositionIncrementAttribute>();
            tokenStream.Reset();

            var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
            var newStream   = _fragmentScorer.Init(tokenStream);

            if (newStream != null)
            {
                tokenStream = newStream;
            }
            _fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            var fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                String tokenText;
                int    startOffset;
                int    endOffset;
                int    lastEndOffset = 0;
                _textFragmenter.Start(text, tokenStream);

                var tokenGroup = new TokenGroup(tokenStream);

                for (bool next = tokenStream.IncrementToken();
                     next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
                     next = tokenStream.IncrementToken())
                {
                    if ((offsetAtt.EndOffset > text.Length)
                        ||
                        (offsetAtt.StartOffset > text.Length)
                        )
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.Term
                                                               + " exceeds length of provided text sized " + text.Length);
                    }
                    if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.MatchStartOffset;
                        endOffset   = tokenGroup.MatchEndOffset;
                        tokenText   = text.Substring(startOffset, endOffset - startOffset);
                        String markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                        {
                            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                        }
                        newText.Append(markedUpText);
                        lastEndOffset = Math.Max(endOffset, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (_textFragmenter.IsNewFragment())
                        {
                            currentFrag.Score = _fragmentScorer.FragmentScore;
                            //record stats for a new fragment
                            currentFrag.TextEndPos = newText.Length;
                            currentFrag            = new TextFragment(newText, newText.Length, docFrags.Count);
                            _fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(_fragmentScorer.GetTokenScore());

                    //                if(lastEndOffset>maxDocBytesToAnalyze)
                    //                {
                    //                    break;
                    //                }
                }
                currentFrag.Score = _fragmentScorer.FragmentScore;

                if (tokenGroup.NumTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.MatchStartOffset;
                    endOffset   = tokenGroup.MatchEndOffset;
                    tokenText   = text.Substring(startOffset, endOffset - startOffset);
                    var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                    {
                        newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                    }
                    newText.Append(markedUpText);
                    lastEndOffset = Math.Max(lastEndOffset, endOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing
                if (
                    //                    if there is text beyond the last token considered..
                    (lastEndOffset < text.Length)
                    &&
                    //                    and that text is not too large...
                    (text.Length <= _maxDocCharsToAnalyze)
                    )
                {
                    //append it to the last fragment
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.TextEndPos = newText.Length;

                //sort the most relevant sections of the text
                foreach (var f in docFrags)
                {
                    currentFrag = f;

                    //If you are running with a version of Lucene before 11th Sept 03
                    // you do not have PriorityQueue.insert() - so uncomment the code below

                    /*
                     *                  if (currentFrag.getScore() >= minScore)
                     *                  {
                     *                      fragQueue.put(currentFrag);
                     *                      if (fragQueue.size() > maxNumFragments)
                     *                      { // if hit queue overfull
                     *                          fragQueue.pop(); // remove lowest in hit queue
                     *                          minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                     *                      }
                     *
                     *
                     *                  }
                     */
                    //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                    //fix to PriorityQueue. The correct method to use here is the new "insert" method
                    // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                    fragQueue.InsertWithOverflow(currentFrag);
                }

                //return the most relevant fragments
                var frag = new TextFragment[fragQueue.Size()];
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    frag = frag.Where(t => (t != null) && (t.Score > 0)).ToArray();
                }

                return(frag);
            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.Close();
                    }
                    catch (Exception)
                    {
                    }
                }
            }
        }