Esempio n. 1
0
 /// <summary>
 /// Appends the marker "!!!!" to the marked-up text of a positively scored fragment
 /// the first time it is processed, then flags the fragment as visited.
 /// </summary>
 /// <param name="frag">The fragment to inspect and, if eligible, mark.</param>
 public static void processFragment(ref TextFragment frag)
 {
     // Nothing to do unless the score is positive and the fragment has not been seen yet.
     if (!(frag.GetScore() > 0) || frag.Visited)
     {
         return;
     }

     frag.MarkedUpText.Append("!!!!");
     frag.Visited = true;
 }
Esempio n. 2
0
 /// <summary>
 /// Resets the per-fragment scoring state: the running total goes back to zero
 /// and the set of found terms is replaced with a new, empty set.
 /// </summary>
 /// <param name="newFragment">The fragment about to be scored; not read by this implementation.</param>
 /// <seealso cref="IScorer.StartFragment"/>
 public virtual void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     foundTerms = new JCG.HashSet<string>();
 }
Esempio n. 3
0
        /// <summary>
        /// Improves readability of a score-sorted list of TextFragments by merging any fragments
        /// that were contiguous in the original text into one larger fragment with the correct order.
        /// This will leave a "null" in the array entry for the lesser scored fragment.
        /// </summary>
        /// <remarks>
        /// The scan is repeated until a full pass performs no merge. A fragment is never compared
        /// with itself: a zero-length fragment "follows" itself, and merging it with itself would
        /// flag progress on every pass and keep the loop from terminating.
        /// </remarks>
        /// <param name="frag">An array of document fragments in descending score</param>
        private static void MergeContiguousFragments(TextFragment[] frag) // LUCENENET: CA1822: Mark members as static
        {
            if (frag.Length <= 1)
            {
                return; // fewer than two fragments: nothing can be merged
            }

            bool mergingStillBeingDone;
            do
            {
                mergingStillBeingDone = false; //initialise loop control flag
                //for each fragment, scan other frags looking for contiguous blocks
                for (int i = 0; i < frag.Length; i++)
                {
                    if (frag[i] == null)
                    {
                        continue;
                    }
                    //merge any contiguous blocks
                    for (int x = 0; x < frag.Length; x++)
                    {
                        if (x == i || frag[x] == null)
                        {
                            // skip self-comparison and slots already merged away
                            continue;
                        }
                        if (frag[i] == null)
                        {
                            // frag[i] was absorbed by an earlier merge in this inner scan
                            break;
                        }

                        TextFragment frag1 = null; // the fragment that comes first in the text
                        TextFragment frag2 = null; // the fragment that immediately follows frag1
                        int frag1Num = 0;
                        int frag2Num = 0;
                        //if blocks are contiguous....
                        if (frag[i].Follows(frag[x]))
                        {
                            frag1 = frag[x];
                            frag1Num = x;
                            frag2 = frag[i];
                            frag2Num = i;
                        }
                        else if (frag[x].Follows(frag[i]))
                        {
                            frag1 = frag[i];
                            frag1Num = i;
                            frag2 = frag[x];
                            frag2Num = x;
                        }

                        if (frag1 == null)
                        {
                            continue; // not contiguous - no merging required
                        }

                        //merging required..
                        int bestScoringFragNum;
                        int worstScoringFragNum;
                        if (frag1.Score > frag2.Score)
                        {
                            bestScoringFragNum = frag1Num;
                            worstScoringFragNum = frag2Num;
                        }
                        else
                        {
                            bestScoringFragNum = frag2Num;
                            worstScoringFragNum = frag1Num;
                        }

                        // frag1 absorbs frag2; the merged fragment is kept in the slot of the
                        // better scoring fragment and the other slot is cleared
                        frag1.Merge(frag2);
                        frag[worstScoringFragNum] = null;
                        mergingStillBeingDone = true;
                        frag[bestScoringFragNum] = frag1;
                    }
                }
            } while (mergingStillBeingDone);
        }
Esempio n. 4
0
        /// <summary>
        /// Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in <see cref="TextFragment"/> objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        /// <param name="tokenStream">
        /// Token stream over <paramref name="text"/>. It is reset here, may be replaced by the stream
        /// returned from the fragment scorer's <c>Init</c> call, and is ended and disposed before this method returns.
        /// </param>
        /// <param name="text">The original text; token offsets are used as indexes into this string.</param>
        /// <param name="mergeContiguousFragments">
        /// When <c>true</c>, contiguous fragments are merged and any resulting entries that are
        /// <c>null</c> or have a score that is not greater than zero are removed from the result.
        /// </param>
        /// <param name="maxNumFragments">Size passed to the <c>FragmentQueue</c> that collects the fragments to return.</param>
        /// <returns>
        /// The fragments taken from the fragment queue, stored from the last array index to the first
        /// in the order they are popped.
        /// </returns>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
        public TextFragment[] GetBestTextFragments(
            TokenStream tokenStream,
            string text,
            bool mergeContiguousFragments,
            int maxNumFragments)
        {
            var docFrags = new JCG.List <TextFragment>();
            // All fragments share this single buffer; each fragment records start/end positions within it.
            var newText  = new StringBuilder();

            var termAtt   = tokenStream.AddAttribute <ICharTermAttribute>();
            var offsetAtt = tokenStream.AddAttribute <IOffsetAttribute>();

            tokenStream.Reset();
            var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);

            if (_fragmentScorer is QueryScorer queryScorer)
            {
                queryScorer.SetMaxDocCharsToAnalyze(_maxDocCharsToAnalyze);
            }

            // The scorer may hand back a different stream to consume; if it does, use that one from here on.
            var newStream = _fragmentScorer.Init(tokenStream);

            if (newStream != null)
            {
                tokenStream = newStream;
            }
            _fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            var fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                string tokenText;
                int    startOffset;
                int    endOffset;
                // Offset in the original text up to which content has already been copied into newText.
                int    lastEndOffset = 0;
                _textFragmenter.Start(text, tokenStream);

                var tokenGroup = new TokenGroup(tokenStream);

                // Consume tokens until the stream is exhausted or a token starts at or beyond the analysis limit.
                for (bool next = tokenStream.IncrementToken();
                     next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
                     next = tokenStream.IncrementToken())
                {
                    // Reject tokens whose offsets point outside the supplied text.
                    if ((offsetAtt.EndOffset > text.Length)
                        ||
                        (offsetAtt.StartOffset > text.Length)
                        )
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.ToString()
                                                               + " exceeds length of provided text sized " + text.Length);
                    }
                    if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.MatchStartOffset;
                        endOffset   = tokenGroup.MatchEndOffset;
                        tokenText   = text.Substring(startOffset, endOffset - startOffset);
                        string markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                        {
                            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                        }
                        newText.Append(markedUpText);
                        // Math.Max keeps lastEndOffset from moving backwards
                        lastEndOffset = Math.Max(endOffset, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (_textFragmenter.IsNewFragment())
                        {
                            // close the current fragment (score and end position) before opening the next one
                            currentFrag.Score = _fragmentScorer.FragmentScore;
                            //record stats for a new fragment
                            currentFrag.TextEndPos = newText.Length;
                            currentFrag            = new TextFragment(newText, newText.Length, docFrags.Count);
                            _fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(_fragmentScorer.GetTokenScore());

                    //                if(lastEndOffset>maxDocBytesToAnalyze)
                    //                {
                    //                    break;
                    //                }
                }
                // record the score of the fragment that was still open when the loop ended
                currentFrag.Score = _fragmentScorer.FragmentScore;

                if (tokenGroup.NumTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.MatchStartOffset;
                    endOffset   = tokenGroup.MatchEndOffset;
                    tokenText   = text.Substring(startOffset, endOffset - startOffset);
                    var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                    {
                        newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                    }
                    newText.Append(markedUpText);
                    lastEndOffset = Math.Max(lastEndOffset, endOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing
                if (
                    //                    if there is text beyond the last token considered..
                    (lastEndOffset < text.Length)
                    &&
                    //                    and that text is not too large...
                    (text.Length <= _maxDocCharsToAnalyze)
                    )
                {
                    //append it to the last fragment
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.TextEndPos = newText.Length;

                //sort the most relevant sections of the text
                foreach (var f in docFrags)
                {
                    currentFrag = f;

                    //If you are running with a version of Lucene before 11th Sept 03
                    // you do not have PriorityQueue.insert() - so uncomment the code below

                    /*
                     *                  if (currentFrag.getScore() >= minScore)
                     *                  {
                     *                      fragQueue.put(currentFrag);
                     *                      if (fragQueue.size() > maxNumFragments)
                     *                      { // if hit queue overfull
                     *                          fragQueue.pop(); // remove lowest in hit queue
                     *                          minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                     *                      }
                     *                  }
                     */
                    //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                    //fix to PriorityQueue. The correct method to use here is the new "insert" method
                    // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                    fragQueue.InsertWithOverflow(currentFrag);
                }

                //return the most relevant fragments
                var frag = new TextFragment[fragQueue.Count];
                // fill the array from the last slot to the first, in the order the queue pops
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    // compact the array: drop null slots and fragments whose score is not above zero
                    JCG.List <TextFragment> fragTexts = new JCG.List <TextFragment>();
                    for (int i = 0; i < frag.Length; i++)
                    {
                        if ((frag[i] != null) && (frag[i].Score > 0))
                        {
                            fragTexts.Add(frag[i]);
                        }
                    }
                    frag = new TextFragment[fragTexts.Count];
                    fragTexts.CopyTo(frag);
                }

                return(frag);
            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.End();
                        tokenStream.Dispose();
                    }
                    catch (Exception e) when(e.IsException())
                    {
                        // cleanup is best-effort: exceptions matching the filter are deliberately ignored.
                        // NOTE(review): if End() throws, Dispose() is skipped - confirm this is acceptable.
                    }
                }
            }
        }
Esempio n. 5
0
 /// <summary>
 /// Determines whether this fragment starts exactly where <paramref name="fragment"/> ends.
 /// </summary>
 /// <param name="fragment">The fragment that may immediately precede this one.</param>
 /// <returns>
 /// <c>true</c> if this fragment's <c>TextStartPos</c> equals the <c>TextEndPos</c> of
 /// <paramref name="fragment"/>; otherwise <c>false</c>.
 /// </returns>
 public virtual bool Follows(TextFragment fragment) => TextStartPos == fragment.TextEndPos;
Esempio n. 6
0
 /// <summary>
 /// Extends this fragment to end where <paramref name="frag2"/> ends and keeps
 /// the higher of the two scores.
 /// </summary>
 /// <param name="frag2">Fragment to be merged into this one</param>
 public virtual void Merge(TextFragment frag2)
 {
     var higherScore = Math.Max(Score, frag2.Score);

     TextEndPos = frag2.TextEndPos;
     Score      = higherScore;
 }
Esempio n. 7
0
 /// <summary>
 /// Resets the scorer for a new fragment by zeroing the running total and
 /// replacing the found-terms set with a fresh one from the set factory.
 /// </summary>
 /// <param name="newFragment">The fragment about to be scored; not read by this implementation.</param>
 /// <seealso cref="IScorer.StartFragment"/>
 public void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     foundTerms = Support.Compatibility.SetFactory.GetSet<string>();
 }
Esempio n. 8
0
 /// <summary>
 /// Begins scoring a new fragment: the accumulated score is cleared and a new
 /// hash set is created to track the terms found in this fragment.
 /// </summary>
 /// <param name="newFragment">The fragment about to be scored; not read by this implementation.</param>
 /// <seealso cref="IScorer.StartFragment"/>
 public virtual void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     foundTerms = Support.Compatibility.SetFactory.CreateHashSet<string>();
 }
Esempio n. 9
0
 /// <summary>
 /// Merges <paramref name="frag2"/> into this fragment: the score becomes the larger
 /// of the two scores and the end position is taken from <paramref name="frag2"/>.
 /// </summary>
 /// <param name="frag2">Fragment to be merged into this one</param>
 public virtual void Merge(TextFragment frag2)
 {
     score = System.Math.Max(score, frag2.score);
     textEndPos = frag2.textEndPos;
 }
Esempio n. 10
0
 /// <summary>
 /// Tests whether this fragment begins at the position where another fragment ends.
 /// </summary>
 /// <param name="fragment">The candidate preceding fragment.</param>
 /// <returns>true if this fragment follows the one passed</returns>
 public virtual bool Follows(TextFragment fragment)
 {
     return fragment.textEndPos == textStartPos;
 }
Esempio n. 11
0
 /// <summary>Improves readability of a score-sorted list of TextFragments by merging any fragments
 /// that were contiguous in the original text into one larger fragment with the correct order.
 /// This will leave a "null" in the array entry for the lesser scored fragment.
 /// </summary>
 /// <remarks>
 /// The scan is repeated until a full pass performs no merge. A fragment is never compared
 /// with itself: a zero-length fragment "follows" itself, and merging it with itself would
 /// flag progress on every pass and keep the loop from terminating.
 /// </remarks>
 /// <param name="frag">An array of document fragments in descending score
 /// </param>
 private void MergeContiguousFragments(TextFragment[] frag)
 {
     if (frag.Length <= 1)
     {
         return; // fewer than two fragments: nothing can be merged
     }

     bool mergingStillBeingDone;
     do
     {
         mergingStillBeingDone = false; //initialise loop control flag
         //for each fragment, scan other frags looking for contiguous blocks
         for (int i = 0; i < frag.Length; i++)
         {
             if (frag[i] == null)
             {
                 continue;
             }
             //merge any contiguous blocks
             for (int x = 0; x < frag.Length; x++)
             {
                 if (x == i || frag[x] == null)
                 {
                     // skip self-comparison and slots already merged away
                     continue;
                 }
                 if (frag[i] == null)
                 {
                     // frag[i] was absorbed by an earlier merge in this inner scan
                     break;
                 }

                 TextFragment frag1 = null; // the fragment that comes first in the text
                 TextFragment frag2 = null; // the fragment that immediately follows frag1
                 int frag1Num = 0;
                 int frag2Num = 0;
                 //if blocks are contiguous....
                 if (frag[i].Follows(frag[x]))
                 {
                     frag1 = frag[x];
                     frag1Num = x;
                     frag2 = frag[i];
                     frag2Num = i;
                 }
                 else if (frag[x].Follows(frag[i]))
                 {
                     frag1 = frag[i];
                     frag1Num = i;
                     frag2 = frag[x];
                     frag2Num = x;
                 }

                 if (frag1 == null)
                 {
                     continue; // not contiguous - no merging required
                 }

                 //merging required..
                 int bestScoringFragNum;
                 int worstScoringFragNum;
                 if (frag1.GetScore() > frag2.GetScore())
                 {
                     bestScoringFragNum = frag1Num;
                     worstScoringFragNum = frag2Num;
                 }
                 else
                 {
                     bestScoringFragNum = frag2Num;
                     worstScoringFragNum = frag1Num;
                 }

                 // frag1 absorbs frag2; the merged fragment is kept in the slot of the
                 // better scoring fragment and the other slot is cleared
                 frag1.Merge(frag2);
                 frag[worstScoringFragNum] = null;
                 mergingStillBeingDone = true;
                 frag[bestScoringFragNum] = frag1;
             }
         }
     }
     while (mergingStillBeingDone);
 }
Esempio n. 12
0
        /// <summary> Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in TextFragment objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        /// <param name="tokenStream">Token stream over <paramref name="text"/>; it is closed before this method returns.</param>
        /// <param name="text">The original text; token offsets are used as indexes into this string.</param>
        /// <param name="mergeContiguousFragments">
        /// When true, contiguous fragments are merged and entries that are null or have a
        /// score that is not greater than zero are removed from the result.
        /// </param>
        /// <param name="maxNumFragments">Size passed to the FragmentQueue that collects the fragments to return.</param>
        /// <returns>
        /// The fragments taken from the fragment queue, stored from the last array index to the
        /// first in the order they are popped.
        /// </returns>
        /// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception>
        public TextFragment[] GetBestTextFragments(TokenStream tokenStream, string text, bool mergeContiguousFragments, int maxNumFragments)
        {
            ArrayList docFrags = new ArrayList();
            // All fragments share this single buffer; each fragment records start/end positions within it.
            StringBuilder newText = new StringBuilder();

            TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
            fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                Lucene.Net.Analysis.Token token;
                string tokenText;
                int startOffset;
                int endOffset;
                // Offset in the original text up to which content has already been copied into newText.
                int lastEndOffset = 0;
                textFragmenter.Start(text);

                TokenGroup tokenGroup = new TokenGroup();

                while ((token = tokenStream.Next()) != null)
                {
                    if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.startOffset;
                        endOffset = tokenGroup.endOffset;
                        tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
                        string markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                        {
                            newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
                        }
                        newText.Append(markedUpText);
                        lastEndOffset = endOffset;
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (textFragmenter.IsNewFragment(token))
                        {
                            // close the current fragment (score and end position) before opening the next one
                            currentFrag.SetScore(fragmentScorer.FragmentScore);
                            //record stats for a new fragment
                            currentFrag.textEndPos = newText.Length;
                            currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
                            fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));

                    // stop once more of the document than the configured limit has been emitted
                    if (lastEndOffset > maxDocBytesToAnalyze)
                    {
                        break;
                    }
                }
                // record the score of the fragment that was still open when the loop ended
                currentFrag.SetScore(fragmentScorer.FragmentScore);

                if (tokenGroup.numTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.startOffset;
                    endOffset = tokenGroup.endOffset;
                    tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
                    string markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                    {
                        newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
                    }
                    newText.Append(markedUpText);
                    lastEndOffset = endOffset;
                }

                // append text after end of last token
                //			if (lastEndOffset < text.length())
                //				newText.append(encoder.encodeText(text.substring(lastEndOffset)));

                currentFrag.textEndPos = newText.Length;

                //sort the most relevant sections of the text
                for (IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); )
                {
                    currentFrag = (TextFragment) i.Current;

                    //If you are running with a version of Lucene before 11th Sept 03
                    // you do not have PriorityQueue.insert() - so uncomment the code below
                    /*
                    if (currentFrag.getScore() >= minScore)
                    {
                    fragQueue.put(currentFrag);
                    if (fragQueue.size() > maxNumFragments)
                    { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                    }

                    }
                    */
                    //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                    //fix to PriorityQueue. The correct method to use here is the new "insert" method
                    // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                    fragQueue.Insert(currentFrag);
                }

                //return the most relevant fragments
                TextFragment[] frag = new TextFragment[fragQueue.Size()];
                // fill the array from the last slot to the first, in the order the queue pops
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = (TextFragment) fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    // compact the array: drop null slots and fragments whose score is not above zero
                    ArrayList fragTexts = new ArrayList();
                    for (int i = 0; i < frag.Length; i++)
                    {
                        if ((frag[i] != null) && (frag[i].GetScore() > 0))
                        {
                            fragTexts.Add(frag[i]);
                        }
                    }
                    //frag = (TextFragment[]) ICollectionSupport.ToArray(fragTexts, new TextFragment[0]);
                    frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
                }

                return frag;
            }
            finally
            {
                // Always close the stream. An exception from Close() propagates unchanged;
                // the former "catch (Exception e) { throw e; }" only reset its stack trace.
                if (tokenStream != null)
                {
                    tokenStream.Close();
                }
            }
        }
Esempio n. 13
0
 /// <summary>
 /// Prepares the scorer for a new fragment: remembers the fragment, clears the
 /// running score and starts an empty set of unique terms.
 /// </summary>
 /// <param name="newFragment">The fragment that becomes the current text fragment.</param>
 public virtual void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     currentTextFragment = newFragment;
     uniqueTermsInFragment = new HashSet<string>();
 }
Esempio n. 14
0
 /// <summary>
 /// Absorbs <paramref name="frag2"/> into this fragment: the end position is taken
 /// from <paramref name="frag2"/> and the score becomes the larger of the two scores.
 /// </summary>
 /// <param name="frag2">Fragment to be merged into this one</param>
 public void Merge(TextFragment frag2)
 {
     var combinedScore = Math.Max(Score, frag2.Score);

     TextEndPos = frag2.TextEndPos;
     Score = combinedScore;
 }
Esempio n. 15
0
 /// <summary>
 /// Checks whether this fragment starts at the position where the given fragment ends.
 /// </summary>
 /// <param name="fragment">The fragment that may immediately precede this one.</param>
 /// <returns>true if this fragment follows the one passed</returns>
 public bool Follows(TextFragment fragment) => TextStartPos == fragment.TextEndPos;
Esempio n. 16
0
 /// <summary>
 /// Starts scoring a new fragment: stores the fragment as the current one, resets the
 /// total score to zero and creates an empty set for the fragment's unique terms.
 /// </summary>
 /// <remarks>
 /// Counterpart of org.apache.lucene.search.highlight.FragmentScorer#startFragment(
 /// org.apache.lucene.search.highlight.TextFragment).
 /// </remarks>
 /// <param name="newFragment">The fragment that becomes the current text fragment.</param>
 public void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     currentTextFragment = newFragment;
     uniqueTermsInFragment = new HashSet<String>();
 }
Esempio n. 17
0
 /// <summary>
 /// Resets the scorer for a new fragment: the fragment is remembered as the current one,
 /// the total score is cleared and a new <c>Hashtable</c> is created to hold the
 /// fragment's unique terms.
 /// </summary>
 /// <param name="newFragment">The fragment that becomes the current text fragment.</param>
 public virtual void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     currentTextFragment = newFragment;
     // a Hashtable is used here in place of the former HashSetSupport
     uniqueTermsInFragment = new Hashtable();
 }
 /// <summary>
 /// Begins a new fragment by clearing the running total score and replacing the
 /// set of found terms with a new, empty set.
 /// </summary>
 /// <param name="newFragment">The new fragment; not read by this implementation.</param>
 public void StartFragment(TextFragment newFragment)
 {
     totalScore = 0;
     foundTerms = new HashSet<string>();
 }