Beispiel #1
0
        /// <summary>
        /// Builds a <see cref="TokenStream"/> from the term vector stored for
        /// <paramref name="field"/> of document <paramref name="docId"/>.
        /// </summary>
        /// <param name="reader">Index reader the term vector is loaded from.</param>
        /// <param name="docId">Number of the document whose term vector is read.</param>
        /// <param name="field">Name of the field whose term vector is read.</param>
        /// <returns>The result of <c>GetTokenStream(TermPositionVector)</c> for the stored vector.</returns>
        /// <exception cref="System.ArgumentException">
        /// No term vector is stored for the field, or the stored vector is not a
        /// <see cref="TermPositionVector"/>.
        /// </exception>
        public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
        {
            // Load the vector once; "as" yields null both when nothing is stored and
            // when the stored vector is not a TermPositionVector, so a single check
            // covers the two failure cases.
            TermPositionVector tpv = reader.GetTermFreqVector(docId, field) as TermPositionVector;

            if (tpv == null)
            {
                throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
            }

            return GetTokenStream(tpv);
        }
        /// <summary>
        /// Reads the term vectors of every document from <c>reader</c>, checks them with
        /// <c>VerifyVectors</c>, and adds the time spent inside the reader calls
        /// (in milliseconds) to <c>timeElapsed</c>.
        /// </summary>
        private void  TestTermVectors()
        {
            int numDocs = reader.NumDocs();

            // Stopwatch is monotonic and high-resolution; DateTime.Now can jump
            // (clock adjustments, DST) and would then corrupt the accumulated time.
            System.Diagnostics.Stopwatch timer = new System.Diagnostics.Stopwatch();

            for (int docId = 0; docId < numDocs; docId++)
            {
                // Time only the retrieval of all term vectors of the document.
                timer.Reset();
                timer.Start();
                TermFreqVector[] vectors = reader.GetTermFreqVectors(docId);
                timer.Stop();
                timeElapsed += timer.ElapsedMilliseconds;

                // verify vectors result
                VerifyVectors(vectors, docId);

                // Time only the retrieval of the single vector for "field".
                timer.Reset();
                timer.Start();
                TermFreqVector vector = reader.GetTermFreqVector(docId, "field");
                timer.Stop();
                timeElapsed += timer.ElapsedMilliseconds;

                // Wrap the single vector so the same verification routine can be reused.
                vectors    = new TermFreqVector[1];
                vectors[0] = vector;

                VerifyVectors(vectors, docId);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Regression test for LUCENENET-183: indexes the two whitespace-separated terms
        /// "a_" and "a0" with term vectors enabled and checks that
        /// <c>IndexOf("a_")</c> on the resulting term vector returns 1.
        /// </summary>
        public void Test_SegmentTermVector_IndexOf()
        {
            Lucene.Net.Store.RAMDirectory directory = new Lucene.Net.Store.RAMDirectory();
            Lucene.Net.Analysis.Analyzer  analyzer  = new Lucene.Net.Analysis.WhitespaceAnalyzer();
            Lucene.Net.Index.IndexWriter  writer    = new Lucene.Net.Index.IndexWriter(directory, analyzer, Lucene.Net.Index.IndexWriter.MaxFieldLength.LIMITED);
            Lucene.Net.Documents.Document document  = new Lucene.Net.Documents.Document();
            document.Add(new Lucene.Net.Documents.Field("contents", new System.IO.StreamReader(new System.IO.MemoryStream(System.Text.Encoding.ASCII.GetBytes("a_ a0"))), Lucene.Net.Documents.Field.TermVector.WITH_OFFSETS));
            writer.AddDocument(document);
            Lucene.Net.Index.IndexReader        reader = writer.GetReader();
            Lucene.Net.Index.TermPositionVector tpv    = reader.GetTermFreqVector(0, "contents") as Lucene.Net.Index.TermPositionVector;

            // Fail with a clear assertion instead of a NullReferenceException when the
            // vector is missing or is not a TermPositionVector.
            Assert.IsNotNull(tpv, "Expected a TermPositionVector for field 'contents'");

            // In ordinal order '0' (0x30) precedes '_' (0x5F), so "a0" sorts before "a_";
            // presumably the vector keeps its terms in that order, which puts "a_" at index 1.
            int index = tpv.IndexOf("a_");

            // Expected value goes first so a failure message reports the values correctly.
            Assert.AreEqual(1, index, "See the issue: LUCENENET-183");
        }
Beispiel #4
0
        /// <summary>
        /// Convenience method that obtains a token stream for a document field, preferring
        /// stored term-vector data and otherwise delegating to the analyzer-based
        /// <c>GetTokenStream(reader, docId, field, analyzer)</c> overload.
        /// Probing for term vectors that do not exist was measured as negligible
        /// (1000 invocations still register 0 ms), so trying both approaches is acceptable.
        /// </summary>
        /// <param name="reader">Index reader used to look up the term vector.</param>
        /// <param name="docId">Number of the document to read.</param>
        /// <param name="field">Name of the field to read.</param>
        /// <param name="analyzer">Analyzer handed to the fallback overload.</param>
        /// <returns>
        /// The stream built from the stored <see cref="TermPositionVector"/> when one is
        /// available and yields a non-null stream; otherwise whatever the analyzer-based
        /// overload returns (per the original note, null if the field is not stored correctly).
        /// </returns>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, string field, Analyzer analyzer)
        {
            // Null when nothing is stored or the stored vector carries no position data.
            TermPositionVector positions = ((TermFreqVector) reader.GetTermFreqVector(docId, field)) as TermPositionVector;

            TokenStream fromVector = positions != null ? GetTokenStream(positions) : null;

            // No token info stored, so fall back to analyzing the raw content.
            return fromVector ?? GetTokenStream(reader, docId, field, analyzer);
        }
Beispiel #5
0
        /// <summary>
        /// Tries the available ways of producing a token stream for a document field:
        /// first from a stored term vector with positions, then by calling the
        /// analyzer-based <c>GetTokenStream(reader, docId, field, analyzer)</c> overload.
        /// Discovering that no term vectors exist is cheap (1000 invocations still register
        /// 0 ms), which is why this try-then-fall-back approach is considered acceptable.
        /// </summary>
        /// <param name="reader">Index reader the term vector is requested from.</param>
        /// <param name="docId">Number of the document to read.</param>
        /// <param name="field">Name of the field to read.</param>
        /// <param name="analyzer">Analyzer passed to the fallback overload.</param>
        /// <returns>
        /// A stream derived from the stored <see cref="TermPositionVector"/> if that yields
        /// a non-null result; otherwise the result of the analyzer-based overload
        /// (per the original note, null if the field is not stored correctly).
        /// </returns>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
        {
            TermFreqVector stored = (TermFreqVector)reader.GetTermFreqVector(docId, field);
            TermPositionVector positionVector = stored as TermPositionVector;

            if (positionVector != null)
            {
                TokenStream fromVector = GetTokenStream(positionVector);
                if (fromVector != null)
                {
                    return fromVector;
                }
            }

            // No token info stored so fall back to analyzing raw content
            return GetTokenStream(reader, docId, field, analyzer);
        }
		/// <summary>
		/// Builds a <see cref="TokenStream"/> from the term vector stored for
		/// <paramref name="field"/> of document <paramref name="docId"/>.
		/// </summary>
		/// <param name="reader">Index reader the term vector is loaded from.</param>
		/// <param name="docId">Number of the document whose term vector is read.</param>
		/// <param name="field">Name of the field whose term vector is read.</param>
		/// <returns>The result of <c>GetTokenStream(TermPositionVector)</c> for the stored vector.</returns>
		/// <exception cref="System.ArgumentException">
		/// No term vector is stored for the field, or the stored vector is not a
		/// <see cref="TermPositionVector"/>.
		/// </exception>
		public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
		{
			// Load the vector once; "as" yields null both when nothing is stored and
			// when the stored vector is not a TermPositionVector, so a single check
			// covers the two failure cases.
			TermPositionVector tpv = reader.GetTermFreqVector(docId, field) as TermPositionVector;
			if (tpv == null)
			{
				throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
			}
			return GetTokenStream(tpv);
		}