/// <summary>Begins scoring a new fragment: clears the running score, forgets the terms
		/// recorded for the previous fragment and remembers <paramref name="newFragment"/> as the current one.
		/// Counterpart of Lucene.Net.Highlight.FragmentScorer.StartFragment(Lucene.Net.Highlight.TextFragment).
		/// </summary>
		/// <param name="newFragment">Fragment that becomes the current fragment for scoring
		/// </param>
		public virtual void  StartFragment(TextFragment newFragment)
		{
			// the score accumulated so far belongs to the previous fragment
			totalScore = 0;
			currentTextFragment = newFragment;
			// a fresh table, so no term recorded for an earlier fragment carries over
			uniqueTermsInFragment = new System.Collections.Hashtable();
		}
Exemple #2
0
		/// <summary>Absorbs another fragment into this one: this fragment keeps the higher of the
		/// two scores and is extended so that it ends where <paramref name="frag2"/> ends.
		/// </summary>
		/// <param name="frag2">Fragment to be merged into this one
		/// </param>
		public virtual void  Merge(TextFragment frag2)
		{
			// the merged fragment is as relevant as its best part
			score = System.Math.Max(score, frag2.score);
			// stretch this fragment up to the end of the absorbed one
			textEndPos = frag2.textEndPos;
		}
Exemple #3
0
		/// <summary>Tests whether this fragment begins exactly at the position where another fragment ends.
		/// </summary>
		/// <param name="fragment">Fragment whose end position is compared with this fragment's start position
		/// </param>
		/// <returns> true if this fragment follows the one passed
		/// </returns>
		public virtual bool Follows(TextFragment fragment)
		{
			bool startsWhereOtherEnds = (fragment.textEndPos == textStartPos);
			return startsWhereOtherEnds;
		}
		/// <summary> Low level api to get the most relevant (formatted) sections of the document.
		/// This method has been made public to allow visibility of score information held in TextFragment objects.
		/// Thanks to Jason Calabrese for help in redefining the interface.
		/// </summary>
		/// <param name="tokenStream">Source of the tokens for <paramref name="text"/>. It is always closed
		/// before this method returns; an exception thrown by the close itself is ignored.
		/// </param>
		/// <param name="text">The original text. Token offsets are used as indexes into this string.
		/// </param>
		/// <param name="mergeContiguousFragments">When true, contiguous fragments are merged and only
		/// non-null fragments with a score greater than zero are returned.
		/// </param>
		/// <param name="maxNumFragments">Size handed to the FragmentQueue that selects the fragments to return.
		/// </param>
		/// <returns>The fragments taken from the queue (the array is filled from its last entry to its first).
		/// Every fragment is constructed over the same marked-up text buffer.
		/// </returns>
		/// <exception cref="System.IO.IOException">Presumably raised by the token stream while tokens are read - confirm against TokenStream.</exception>
		public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments)
		{
			System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
			System.Text.StringBuilder newText = new System.Text.StringBuilder();
			
			TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
			fragmentScorer.StartFragment(currentFrag);
			docFrags.Add(currentFrag);
			
			FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
			
			try
			{
				Lucene.Net.Analysis.Token token;
				System.String tokenText;
				int startOffset;
				int endOffset;
				int lastEndOffset = 0;
				textFragmenter.Start(text);
				
				TokenGroup tokenGroup = new TokenGroup();
				token = tokenStream.Next();
				while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze))
				{
					if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
					{
						//the current token is distinct from previous tokens - 
						// markup the cached token group info
						startOffset = tokenGroup.matchStartOffset;
						endOffset = tokenGroup.matchEndOffset;
						tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
						System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
						//store any whitespace etc from between this and last group
						if (startOffset > lastEndOffset)
						{
							newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
						}
						newText.Append(markedUpText);
						lastEndOffset = System.Math.Max(endOffset, lastEndOffset);
						tokenGroup.Clear();
						
						//check if current token marks the start of a new fragment
						if (textFragmenter.IsNewFragment(token))
						{
							currentFrag.SetScore(fragmentScorer.GetFragmentScore());
							//record stats for a new fragment
							currentFrag.textEndPos = newText.Length;
							currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
							fragmentScorer.StartFragment(currentFrag);
							docFrags.Add(currentFrag);
						}
					}
					
					tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));
					
					token = tokenStream.Next();
				}
				currentFrag.SetScore(fragmentScorer.GetFragmentScore());
				
				if (tokenGroup.numTokens > 0)
				{
					//flush the accumulated text (same code as in above loop)
					startOffset = tokenGroup.matchStartOffset;
					endOffset = tokenGroup.matchEndOffset;
					tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
					System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
					//store any whitespace etc from between this and last group
					if (startOffset > lastEndOffset)
					{
						newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
					}
					newText.Append(markedUpText);
					lastEndOffset = System.Math.Max(lastEndOffset, endOffset);
				}
				
				//Test what remains of the original text beyond the point where we stopped analyzing.
				//The comparison is inclusive: a text whose length equals the limit has had every token
				//considered (all start offsets are below the limit), so its tail must be kept too.
				if ((lastEndOffset < text.Length) && (text.Length <= maxDocBytesToAnalyze))
				{
					//append it to the last fragment
					newText.Append(encoder.EncodeText(text.Substring(lastEndOffset)));
				}
				
				currentFrag.textEndPos = newText.Length;
				
				//sort the most relevant sections of the text.
				//Insert is the PriorityQueue method to use here (available since Christoph Goller's
				//11th Sept 03 fix to PriorityQueue).
				foreach (TextFragment docFrag in docFrags)
				{
					fragQueue.Insert(docFrag);
				}
				
				//return the most relevant fragments
				TextFragment[] frag = new TextFragment[fragQueue.Size()];
				for (int i = frag.Length - 1; i >= 0; i--)
				{
					frag[i] = (TextFragment) fragQueue.Pop();
				}
				
				//merge any contiguous fragments to improve readability
				if (mergeContiguousFragments)
				{
					MergeContiguousFragments(frag);
					//merging leaves null entries behind; drop those and any fragment without a positive score
					System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
					for (int i = 0; i < frag.Length; i++)
					{
						if ((frag[i] != null) && (frag[i].GetScore() > 0))
						{
							fragTexts.Add(frag[i]);
						}
					}
					frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
				}
				
				return frag;
			}
			finally
			{
				if (tokenStream != null)
				{
					try
					{
						tokenStream.Close();
					}
					catch (System.Exception)
					{
						//best effort: a failure to close must not hide the result or the original exception
					}
				}
			}
		}
		/// <summary>Improves readability of a score-sorted list of TextFragments by merging any fragments 
		/// that were contiguous in the original text into one larger fragment with the correct order.
		/// This will leave a "null" in the array entry for the lesser scored fragment; the merged
		/// fragment is stored in the entry of the better scored one.
		/// </summary>
		/// <param name="frag">An array of document fragments in descending score; it is modified in place
		/// </param>
		private void  MergeContiguousFragments(TextFragment[] frag)
		{
			if (frag.Length <= 1)
			{
				//a single fragment (or none) has nothing to be merged with
				return;
			}
			bool mergingStillBeingDone;
			do 
			{
				mergingStillBeingDone = false; //initialise loop control flag
				//for each fragment, scan other frags looking for contiguous blocks
				for (int i = 0; i < frag.Length; i++)
				{
					if (frag[i] == null)
					{
						continue;
					}
					//merge any contiguous blocks 
					for (int x = 0; x < frag.Length; x++)
					{
						if (x == i)
						{
							//never pair a fragment with itself: a zero-length fragment "follows" itself,
							//which would flag a merge on every pass without changing the array
							//and keep the outer loop running forever
							continue;
						}
						if (frag[x] == null)
						{
							continue;
						}
						if (frag[i] == null)
						{
							//frag[i] was absorbed by an earlier merge in this scan
							break;
						}
						TextFragment frag1 = null;
						TextFragment frag2 = null;
						int frag1Num = 0;
						int frag2Num = 0;
						int bestScoringFragNum;
						int worstScoringFragNum;
						//if blocks are contiguous.... (frag1 is the earlier block, frag2 the one following it)
						if (frag[i].Follows(frag[x]))
						{
							frag1 = frag[x];
							frag1Num = x;
							frag2 = frag[i];
							frag2Num = i;
						}
						else if (frag[x].Follows(frag[i]))
						{
							frag1 = frag[i];
							frag1Num = i;
							frag2 = frag[x];
							frag2Num = x;
						}
						//merging required..
						if (frag1 != null)
						{
							if (frag1.GetScore() > frag2.GetScore())
							{
								bestScoringFragNum = frag1Num;
								worstScoringFragNum = frag2Num;
							}
							else
							{
								bestScoringFragNum = frag2Num;
								worstScoringFragNum = frag1Num;
							}
							//frag1 absorbs frag2 and takes over the slot of the better scoring fragment
							frag1.Merge(frag2);
							frag[worstScoringFragNum] = null;
							mergingStillBeingDone = true;
							frag[bestScoringFragNum] = frag1;
						}
					}
				}
			}
			while (mergingStillBeingDone);
		}
Exemple #6
0
 /// <summary>Begins scoring a new fragment: clears the running score, forgets the terms
 /// recorded for the previous fragment and remembers <paramref name="newFragment"/> as the current one.
 /// Counterpart of Lucene.Net.Highlight.FragmentScorer.StartFragment(Lucene.Net.Highlight.TextFragment).
 /// </summary>
 /// <param name="newFragment">Fragment that becomes the current fragment for scoring
 /// </param>
 public virtual void  StartFragment(TextFragment newFragment)
 {
     // the score accumulated so far belongs to the previous fragment
     totalScore = 0;
     currentTextFragment = newFragment;
     // a fresh table, so no term recorded for an earlier fragment carries over
     uniqueTermsInFragment = new System.Collections.Hashtable();
 }
Exemple #7
0
        /// <summary>Improves readability of a score-sorted list of TextFragments by merging any fragments
        /// that were contiguous in the original text into one larger fragment with the correct order.
        /// This will leave a "null" in the array entry for the lesser scored fragment; the merged
        /// fragment is stored in the entry of the better scored one.
        /// </summary>
        /// <param name="frag">An array of document fragments in descending score; it is modified in place
        /// </param>
        private void  MergeContiguousFragments(TextFragment[] frag)
        {
            if (frag.Length <= 1)
            {
                //a single fragment (or none) has nothing to be merged with
                return;
            }

            bool mergingStillBeingDone;

            do
            {
                mergingStillBeingDone = false;                     //initialise loop control flag
                //for each fragment, scan other frags looking for contiguous blocks
                for (int i = 0; i < frag.Length; i++)
                {
                    if (frag[i] == null)
                    {
                        continue;
                    }
                    //merge any contiguous blocks
                    for (int x = 0; x < frag.Length; x++)
                    {
                        if (x == i)
                        {
                            //never pair a fragment with itself: a zero-length fragment "follows" itself,
                            //which would flag a merge on every pass without changing the array
                            //and keep the outer loop running forever
                            continue;
                        }
                        if (frag[x] == null)
                        {
                            continue;
                        }
                        if (frag[i] == null)
                        {
                            //frag[i] was absorbed by an earlier merge in this scan
                            break;
                        }
                        TextFragment frag1    = null;
                        TextFragment frag2    = null;
                        int          frag1Num = 0;
                        int          frag2Num = 0;
                        int          bestScoringFragNum;
                        int          worstScoringFragNum;
                        //if blocks are contiguous.... (frag1 is the earlier block, frag2 the one following it)
                        if (frag[i].Follows(frag[x]))
                        {
                            frag1    = frag[x];
                            frag1Num = x;
                            frag2    = frag[i];
                            frag2Num = i;
                        }
                        else if (frag[x].Follows(frag[i]))
                        {
                            frag1    = frag[i];
                            frag1Num = i;
                            frag2    = frag[x];
                            frag2Num = x;
                        }
                        //merging required..
                        if (frag1 != null)
                        {
                            if (frag1.GetScore() > frag2.GetScore())
                            {
                                bestScoringFragNum  = frag1Num;
                                worstScoringFragNum = frag2Num;
                            }
                            else
                            {
                                bestScoringFragNum  = frag2Num;
                                worstScoringFragNum = frag1Num;
                            }
                            //frag1 absorbs frag2 and takes over the slot of the better scoring fragment
                            frag1.Merge(frag2);
                            frag[worstScoringFragNum] = null;
                            mergingStillBeingDone     = true;
                            frag[bestScoringFragNum]  = frag1;
                        }
                    }
                }
            }while (mergingStillBeingDone);
        }
Exemple #8
0
        /// <summary> Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in TextFragment objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        /// <param name="tokenStream">Source of the tokens for <paramref name="text"/>. It is always closed
        /// before this method returns; an exception thrown by the close itself is ignored.
        /// </param>
        /// <param name="text">The original text. Token offsets are used as indexes into this string.
        /// </param>
        /// <param name="mergeContiguousFragments">When true, contiguous fragments are merged and only
        /// non-null fragments with a score greater than zero are returned.
        /// </param>
        /// <param name="maxNumFragments">Size handed to the FragmentQueue that selects the fragments to return.
        /// </param>
        /// <returns>The fragments taken from the queue (the array is filled from its last entry to its first).
        /// Every fragment is constructed over the same marked-up text buffer.
        /// </returns>
        /// <exception cref="System.IO.IOException">Presumably raised by the token stream while tokens are read - confirm against TokenStream.</exception>
        public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments)
        {
            System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
            System.Text.StringBuilder    newText  = new System.Text.StringBuilder();

            TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);

            fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                Lucene.Net.Analysis.Token token;
                System.String             tokenText;
                int startOffset;
                int endOffset;
                int lastEndOffset = 0;
                textFragmenter.Start(text);

                TokenGroup tokenGroup = new TokenGroup();
                token = tokenStream.Next();
                while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze))
                {
                    if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.matchStartOffset;
                        endOffset   = tokenGroup.matchEndOffset;
                        tokenText   = text.Substring(startOffset, (endOffset) - (startOffset));
                        System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                        {
                            newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
                        }
                        newText.Append(markedUpText);
                        lastEndOffset = System.Math.Max(endOffset, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (textFragmenter.IsNewFragment(token))
                        {
                            currentFrag.SetScore(fragmentScorer.GetFragmentScore());
                            //record stats for a new fragment
                            currentFrag.textEndPos = newText.Length;
                            currentFrag            = new TextFragment(newText, newText.Length, docFrags.Count);
                            fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));

                    token = tokenStream.Next();
                }
                currentFrag.SetScore(fragmentScorer.GetFragmentScore());

                if (tokenGroup.numTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.matchStartOffset;
                    endOffset   = tokenGroup.matchEndOffset;
                    tokenText   = text.Substring(startOffset, (endOffset) - (startOffset));
                    System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                    {
                        newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
                    }
                    newText.Append(markedUpText);
                    lastEndOffset = System.Math.Max(lastEndOffset, endOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing.
                //The comparison is inclusive: a text whose length equals the limit has had every token
                //considered (all start offsets are below the limit), so its tail must be kept too.
                if ((lastEndOffset < text.Length) && (text.Length <= maxDocBytesToAnalyze))
                {
                    //append it to the last fragment
                    newText.Append(encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.textEndPos = newText.Length;

                //sort the most relevant sections of the text.
                //Insert is the PriorityQueue method to use here (available since Christoph Goller's
                //11th Sept 03 fix to PriorityQueue).
                foreach (TextFragment docFrag in docFrags)
                {
                    fragQueue.Insert(docFrag);
                }

                //return the most relevant fragments
                TextFragment[] frag = new TextFragment[fragQueue.Size()];
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = (TextFragment)fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    //merging leaves null entries behind; drop those and any fragment without a positive score
                    System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
                    for (int i = 0; i < frag.Length; i++)
                    {
                        if ((frag[i] != null) && (frag[i].GetScore() > 0))
                        {
                            fragTexts.Add(frag[i]);
                        }
                    }
                    frag = (TextFragment[])fragTexts.ToArray(typeof(TextFragment));
                }

                return frag;
            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.Close();
                    }
                    catch (System.Exception)
                    {
                        //best effort: a failure to close must not hide the result or the original exception
                    }
                }
            }
        }
Exemple #9
0
 /// <summary>Tests whether this fragment begins exactly at the position where another fragment ends.
 /// </summary>
 /// <param name="fragment">Fragment whose end position is compared with this fragment's start position
 /// </param>
 /// <returns> true if this fragment follows the one passed
 /// </returns>
 public virtual bool Follows(TextFragment fragment)
 {
     bool startsWhereOtherEnds = (fragment.textEndPos == textStartPos);
     return startsWhereOtherEnds;
 }
Exemple #10
0
 /// <summary>Absorbs another fragment into this one: this fragment keeps the higher of the
 /// two scores and is extended so that it ends where <paramref name="frag2"/> ends.
 /// </summary>
 /// <param name="frag2">Fragment to be merged into this one
 /// </param>
 public virtual void  Merge(TextFragment frag2)
 {
     // the merged fragment is as relevant as its best part
     score = System.Math.Max(score, frag2.score);
     // stretch this fragment up to the end of the absorbed one
     textEndPos = frag2.textEndPos;
 }