/// <summary>
/// Builds a <c>TokenStream</c> from the term position vector stored for
/// <paramref name="field"/> in document <paramref name="docId"/>.
/// </summary>
/// <param name="reader">index reader holding the stored term vectors</param>
/// <param name="docId">document number to read the vector from</param>
/// <param name="field">field whose term vector is required</param>
/// <returns>a token stream reconstructed from the stored positions</returns>
/// <exception cref="System.ArgumentException">
/// if the field has no term vector, or its vector carries no position data
/// </exception>
public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
{
    TermFreqVector tfv = (TermFreqVector)reader.GetTermFreqVector(docId, field);
    if (tfv == null)
    {
        // BUG FIX: message previously read "#5does not ..." — space was missing
        // after the doc number.
        throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
    }
    if (tfv is TermPositionVector)
    {
        // BUG FIX: reuse the vector already fetched above instead of calling
        // reader.GetTermFreqVector a second time (redundant index access).
        TermPositionVector tpv = (TermPositionVector)tfv;
        return GetTokenStream(tpv);
    }
    throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
}
/// <summary>
/// Exercises term-vector retrieval for every document in the index, accumulating
/// the retrieval time (ms) into the <c>timeElapsed</c> field and verifying each
/// result via <c>VerifyVectors</c>.
/// </summary>
private void TestTermVectors()
{
    // check:
    int docCount = reader.NumDocs();
    for (int doc = 0; doc < docCount; doc++)
    {
        // Time the all-fields vector fetch; only the fetch itself is measured.
        long fetchStart = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
        TermFreqVector[] allVectors = reader.GetTermFreqVectors(doc);
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - fetchStart;

        // verify vectors result
        VerifyVectors(allVectors, doc);

        // Time the single-field fetch for "field" and verify it the same way.
        fetchStart = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
        TermFreqVector singleVector = reader.GetTermFreqVector(doc, "field");
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - fetchStart;

        VerifyVectors(new TermFreqVector[] { singleVector }, doc);
    }
}
/// <summary>
/// Regression test for LUCENENET-183: <c>TermPositionVector.IndexOf</c> must
/// locate "a_" at index 1 when the indexed field contains "a_ a0" with offsets.
/// </summary>
public void Test_SegmentTermVector_IndexOf()
{
    Lucene.Net.Store.RAMDirectory directory = new Lucene.Net.Store.RAMDirectory();
    Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
    Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(directory, analyzer, Lucene.Net.Index.IndexWriter.MaxFieldLength.LIMITED);

    Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document();
    document.Add(new Lucene.Net.Documents.Field("contents", new System.IO.StreamReader(new System.IO.MemoryStream(System.Text.Encoding.ASCII.GetBytes("a_ a0"))), Lucene.Net.Documents.Field.TermVector.WITH_OFFSETS));
    writer.AddDocument(document);

    Lucene.Net.Index.IndexReader reader = writer.GetReader();
    Lucene.Net.Index.TermPositionVector tpv = reader.GetTermFreqVector(0, "contents") as Lucene.Net.Index.TermPositionVector;
    // BUG FIX: guard the 'as' cast — a missing position vector previously
    // surfaced as a NullReferenceException instead of a clear test failure.
    Assert.IsNotNull(tpv, "Expected a TermPositionVector for field 'contents'");

    //Console.WriteLine("tpv: " + tpv);
    int index = tpv.IndexOf("a_");
    // BUG FIX: NUnit's Assert.AreEqual signature is (expected, actual); the
    // arguments were reversed, which would produce a misleading failure message.
    Assert.AreEqual(1, index, "See the issue: LUCENENET-183");

    // BUG FIX: release the reader/writer so the RAMDirectory is not leaked.
    reader.Close();
    writer.Close();
}
/// <summary> A convenience method that tries a number of approaches to getting a token stream.
/// The cost of finding there are no termVectors in the index is minimal (1000 invocations still
/// registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
/// </summary>
/// <param name="reader">index reader to pull the term vector (or stored content) from</param>
/// <param name="docId">document number</param>
/// <param name="field">field whose tokens are required</param>
/// <param name="analyzer">analyzer used to re-tokenize content when no position vector is stored</param>
/// <returns> null if field not stored correctly
/// </returns>
/// <throws> IOException </throws>
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, string field, Analyzer analyzer)
{
    TokenStream ts = null;
    // Prefer the stored term position vector: rebuilding the stream from it
    // avoids re-analyzing the document text.
    TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
    if (tfv != null)
    {
        if (tfv is TermPositionVector)
        {
            ts = GetTokenStream((TermPositionVector) tfv);
        }
    }
    //No token info stored so fall back to analyzing raw content
    if (ts == null)
    {
        ts = GetTokenStream(reader, docId, field, analyzer);
    }
    return ts;
}
/// <summary> A convenience method that tries a number of approaches to getting a token stream.
/// The cost of finding there are no termVectors in the index is minimal (1000 invocations still
/// registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
/// </summary>
/// <param name="reader">index reader to pull the term vector (or stored content) from</param>
/// <param name="docId">document number</param>
/// <param name="field">field whose tokens are required</param>
/// <param name="analyzer">analyzer used to re-tokenize content when no position vector is stored</param>
/// <returns> null if field not stored correctly
/// </returns>
/// <throws> IOException </throws>
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
{
    // First choice: rebuild the stream from a stored term position vector.
    TermFreqVector termVector = (TermFreqVector)reader.GetTermFreqVector(docId, field);
    TermPositionVector positions = termVector as TermPositionVector;

    TokenStream stream = null;
    if (positions != null)
    {
        stream = GetTokenStream(positions);
    }

    // No token info stored so fall back to analyzing raw content.
    if (stream == null)
    {
        stream = GetTokenStream(reader, docId, field, analyzer);
    }
    return stream;
}
/// <summary>
/// Builds a <c>TokenStream</c> from the term position vector stored for
/// <paramref name="field"/> in document <paramref name="docId"/>.
/// </summary>
/// <param name="reader">index reader holding the stored term vectors</param>
/// <param name="docId">document number to read the vector from</param>
/// <param name="field">field whose term vector is required</param>
/// <returns>a token stream reconstructed from the stored positions</returns>
/// <exception cref="System.ArgumentException">
/// if the field has no term vector, or its vector carries no position data
/// </exception>
public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
{
    TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
    if (tfv == null)
    {
        // BUG FIX: message previously read "#5does not ..." — space was missing
        // after the doc number.
        throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
    }
    if (tfv is TermPositionVector)
    {
        // BUG FIX: reuse the vector already fetched above instead of calling
        // reader.GetTermFreqVector a second time (redundant index access).
        TermPositionVector tpv = (TermPositionVector) tfv;
        return GetTokenStream(tpv);
    }
    throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
}