Example #1
        public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
        {
            TermFreqVector tfv = reader.GetTermFreqVector(docId, field);

            if (tfv == null)
            {
                throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term vector data stored");
            }
            if (tfv is TermPositionVector)
            {
                // Reuse the vector already in hand rather than reading it from the index a second time
                TermPositionVector tpv = (TermPositionVector)tfv;
                return GetTokenStream(tpv);
            }
            throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
        }
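A minimal sketch of how this entry point might be driven, assuming the contrib Highlighter API (Highlighter, QueryScorer, GetBestFragment) is available; directory, docId, query and storedText are illustrative placeholders:

        // Rebuild the token stream from the stored term vector and hand it to the
        // highlighter, avoiding a full re-analysis of the original text.
        IndexReader reader = IndexReader.Open(directory, true);
        TokenStream stream = GetTokenStream(reader, docId, "contents");
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        System.String fragment = highlighter.GetBestFragment(stream, storedText);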
Example #2
        public void Test_SegmentTermVector_IndexOf()
        {
            Lucene.Net.Store.RAMDirectory directory = new Lucene.Net.Store.RAMDirectory();
            Lucene.Net.Analysis.Analyzer  analyzer  = new Lucene.Net.Analysis.WhitespaceAnalyzer();
            Lucene.Net.Index.IndexWriter  writer    = new Lucene.Net.Index.IndexWriter(directory, analyzer, Lucene.Net.Index.IndexWriter.MaxFieldLength.LIMITED);
            Lucene.Net.Documents.Document document  = new Lucene.Net.Documents.Document();
            document.Add(new Lucene.Net.Documents.Field("contents", new System.IO.StreamReader(new System.IO.MemoryStream(System.Text.Encoding.ASCII.GetBytes("a_ a0"))), Lucene.Net.Documents.Field.TermVector.WITH_OFFSETS));
            writer.AddDocument(document);
            Lucene.Net.Index.IndexReader        reader = writer.GetReader();
            Lucene.Net.Index.TermPositionVector tpv    = reader.GetTermFreqVector(0, "contents") as Lucene.Net.Index.TermPositionVector;
            //Console.WriteLine("tpv: " + tpv);
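            // Terms in a term vector come back in sorted order: "a0" ('0' is 0x30)
            // sorts before "a_" ('_' is 0x5F), so "a_" should land at index 1.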
            int index = tpv.IndexOf("a_");

            Assert.AreEqual(1, index, "See the issue: LUCENENET-183");
        }
		/// <summary> Low level API.
		/// Returns a token stream, or null if no offset info is available in the index.
		/// This can be used to feed the highlighter with a pre-parsed token stream.
		/// 
		/// In my tests the speeds to recreate 1000 token streams using this method are:
		/// - with TermVector offset-only data stored - 420 milliseconds 
		/// - with TermVector offset AND position data stored - 271 milliseconds
		/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
		/// positions - no overlaps or gaps)
		/// The cost of not using TermPositionVector to store
		/// pre-parsed content and using an analyzer to re-parse the original content: 
		/// - reanalyzing the original content - 980 milliseconds
		/// 
		/// The re-analyze timings will typically vary depending on -
		/// 1) The complexity of the analyzer code (timings above were using a 
		/// stemmer/lowercaser/stopword combo)
		/// 2) The number of other fields (Lucene reads ALL fields off the disk 
		/// when accessing just one document field - this can cost dearly!)
		/// 3) Use of compression on field storage - could be faster because of compression (less disk I/O)
		/// or slower (more CPU burn) depending on the content.
		/// 
		/// </summary>
		/// <param name="tpv">the term position vector to rebuild the token stream from
		/// </param>
		/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
		/// to eke out the last drops of performance, set to true. If in doubt, set to false.
		/// </param>
		public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
		{
			//an object used to iterate across an array of tokens
			//code to reconstruct the original sequence of Tokens
			System.String[] terms = tpv.GetTerms();
			int[] freq = tpv.GetTermFrequencies();
			int totalTokens = 0;
			for (int t = 0; t < freq.Length; t++)
			{
				totalTokens += freq[t];
			}
			Token[] tokensInOriginalOrder = new Token[totalTokens];
			System.Collections.ArrayList unsortedTokens = null;
			for (int t = 0; t < freq.Length; t++)
			{
				TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
				if (offsets == null)
				{
					return null;
				}
				
				int[] pos = null;
				if (tokenPositionsGuaranteedContiguous)
				{
					//try to get the token position info to speed up assembly of tokens into sorted sequence
					pos = tpv.GetTermPositions(t);
				}
				if (pos == null)
				{
					//tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
					if (unsortedTokens == null)
					{
						unsortedTokens = new System.Collections.ArrayList();
					}
					for (int tp = 0; tp < offsets.Length; tp++)
					{
						unsortedTokens.Add(new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset()));
					}
				}
				else
				{
					//We have positions stored and a guarantee that the token position information is contiguous
					
					// This may be fast BUT won't work if a Tokenizer is used which creates >1 token in the same position or
					// creates jumps in position numbers - this code would fail under those circumstances
					
					//tokens stored with positions - can use this to index straight into sorted array
					for (int tp = 0; tp < pos.Length; tp++)
					{
						tokensInOriginalOrder[pos[tp]] = new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset());
					}
				}
			}
			//If the field has been stored without position data we must perform a sort        
			if (unsortedTokens != null)
			{
				tokensInOriginalOrder = (Token[]) unsortedTokens.ToArray(typeof(Token));
				Array.Sort(tokensInOriginalOrder, new AnonymousClassComparator());
			}
			return new StoredTokenStream(tokensInOriginalOrder);
		}
		public static TokenStream GetTokenStream(TermPositionVector tpv)
		{
			//assumes the worst and makes no assumptions about token position sequences.
			return GetTokenStream(tpv, false);
		}
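A short usage sketch contrasting the two overloads above with the re-analysis path the timings in the summary refer to; tpv, analyzer and storedText are illustrative placeholders:

		// Safe default: tokens are sorted by offset; works with any tokenizer.
		TokenStream safeStream = GetTokenStream(tpv);

		// Fast path: tokens are indexed straight into the output array by position.
		// Only valid when the tokenizer emits contiguous positions (no gaps or overlaps).
		TokenStream fastStream = GetTokenStream(tpv, true);

		// The slower alternative measured above: re-parse the stored text with the
		// analyzer instead of rebuilding tokens from the term vector.
		TokenStream reanalyzed = analyzer.TokenStream("contents", new System.IO.StringReader(storedText));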
Example #5
        /// <summary> Low level API.
        /// Returns a token stream, or null if no offset info is available in the index.
        /// This can be used to feed the highlighter with a pre-parsed token stream.
        ///
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// - with TermVector offset-only data stored - 420 milliseconds
        /// - with TermVector offset AND position data stored - 271 milliseconds
        /// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
        /// positions - no overlaps or gaps)
        /// The cost of not using TermPositionVector to store
        /// pre-parsed content and using an analyzer to re-parse the original content:
        /// - reanalyzing the original content - 980 milliseconds
        ///
        /// The re-analyze timings will typically vary depending on -
        /// 1) The complexity of the analyzer code (timings above were using a
        /// stemmer/lowercaser/stopword combo)
        /// 2) The number of other fields (Lucene reads ALL fields off the disk
        /// when accessing just one document field - this can cost dearly!)
        /// 3) Use of compression on field storage - could be faster because of compression (less disk I/O)
        /// or slower (more CPU burn) depending on the content.
        ///
        /// </summary>
        /// <param name="tpv">the term position vector to rebuild the token stream from
        /// </param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eke out the last drops of performance, set to true. If in doubt, set to false.
        /// </param>
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            //an object used to iterate across an array of tokens
            //code to reconstruct the original sequence of Tokens
            System.String[] terms       = tpv.GetTerms();
            int[]           freq        = tpv.GetTermFrequencies();
            int             totalTokens = 0;

            for (int t = 0; t < freq.Length; t++)
            {
                totalTokens += freq[t];
            }
            Token[] tokensInOriginalOrder = new Token[totalTokens];
            System.Collections.ArrayList unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }

                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    //try to get the token position info to speed up assembly of tokens into sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new System.Collections.ArrayList();
                    }
                    for (int tp = 0; tp < offsets.Length; tp++)
                    {
                        unsortedTokens.Add(new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset()));
                    }
                }
                else
                {
                    //We have positions stored and a guarantee that the token position information is contiguous

                    // This may be fast BUT won't work if a Tokenizer is used which creates >1 token in the same position or
                    // creates jumps in position numbers - this code would fail under those circumstances

                    //tokens stored with positions - can use this to index straight into sorted array
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        tokensInOriginalOrder[pos[tp]] = new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset());
                    }
                }
            }
            //If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = (Token[])unsortedTokens.ToArray(typeof(Token));
                Array.Sort(tokensInOriginalOrder, new AnonymousClassComparator());
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }
Example #6
        public static TokenStream GetTokenStream(TermPositionVector tpv)
        {
            //assumes the worst and makes no assumptions about token position sequences.
            return GetTokenStream(tpv, false);
        }