/// <summary>
/// Builds a token stream for one field of one document from the term vector
/// that <paramref name="reader"/> returns for it.
/// </summary>
/// <param name="reader">index reader used to look up the term vector</param>
/// <param name="docId">document number passed to the reader</param>
/// <param name="field">name of the field whose term vector is requested</param>
/// <returns>the result of <c>GetTokenStream(TermPositionVector)</c> for the field's term vector</returns>
/// <exception cref="System.ArgumentException">
/// the reader returned no term vector for the field, or the vector it returned
/// is not a <c>TermPositionVector</c>
/// </exception>
public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
{
    // One lookup only: 'as' yields null both when nothing is stored and when the
    // stored vector is not a TermPositionVector - the two cases that must throw.
    TermPositionVector tpv = reader.GetTermFreqVector(docId, field) as TermPositionVector;
    if (tpv == null)
    {
        // Note the leading space before "does": without it the doc id runs into the text.
        throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
    }

    return GetTokenStream(tpv);
}
/// <summary>
/// Indexes a single document whose "contents" field holds the text "a_ a0"
/// (whitespace-analyzed, term vector stored with offsets) and checks that
/// <c>IndexOf("a_")</c> on that field's term vector returns 1.
/// See the issue: LUCENENET-183.
/// </summary>
public void Test_SegmentTermVector_IndexOf()
{
    Lucene.Net.Store.RAMDirectory directory = new Lucene.Net.Store.RAMDirectory();
    Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
    Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(directory, analyzer, Lucene.Net.Index.IndexWriter.MaxFieldLength.LIMITED);

    Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document();
    document.Add(new Lucene.Net.Documents.Field("contents", new System.IO.StreamReader(new System.IO.MemoryStream(System.Text.Encoding.ASCII.GetBytes("a_ a0"))), Lucene.Net.Documents.Field.TermVector.WITH_OFFSETS));
    writer.AddDocument(document);

    Lucene.Net.Index.IndexReader reader = writer.GetReader();
    Lucene.Net.Index.TermPositionVector tpv = reader.GetTermFreqVector(0, "contents") as Lucene.Net.Index.TermPositionVector;

    // Fail with a readable assertion instead of a NullReferenceException if the
    // vector is missing or is not a TermPositionVector.
    Assert.IsNotNull(tpv, "Expected a TermPositionVector for field 'contents'");

    // Presumably "a0" sorts before "a_" in the vector ('0' < '_'), putting "a_" at index 1 - confirm.
    int index = tpv.IndexOf("a_");

    // Expected value goes first so a failure reports expected/actual the right way round.
    Assert.AreEqual(1, index, "See the issue: LUCENENET-183");
}
/// <summary>
/// Low level api.
/// Rebuilds a token stream from a stored term vector, or returns null if no
/// offset info is available in the index. The result can be used to feed the
/// highlighter with a pre-parsed token stream.
///
/// Timings reported by the original author for recreating 1000 token streams:
/// - with TermVector offset only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
///   (nb timings for TermVector with position data are based on a tokenizer
///   with contiguous positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store pre-parsed content and
/// using an analyzer to re-parse the original content instead:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on -
/// 1) The complexity of the analyzer code (timings above were using a
///    stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
///    when accessing just one document field - can cost dear!)
/// 3) Use of compression on field storage - could be faster because of
///    compression (less disk IO) or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">term vector supplying the terms, their frequencies, offsets and (optionally) positions</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps.
/// If looking to eke out the last drops of performance, set to true. If in doubt, set to false.
/// </param>
/// <returns>null as soon as any term has no offsets; otherwise a StoredTokenStream over the rebuilt tokens</returns>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    System.String[] termTexts = tpv.GetTerms();
    int[] termCounts = tpv.GetTermFrequencies();

    // The sum of all term frequencies sizes the position-indexed result array.
    int tokenCount = 0;
    foreach (int count in termCounts)
    {
        tokenCount += count;
    }

    Token[] orderedTokens = new Token[tokenCount];
    System.Collections.ArrayList pendingSort = null;

    for (int termIndex = 0; termIndex < termCounts.Length; termIndex++)
    {
        TermVectorOffsetInfo[] termOffsets = tpv.GetOffsets(termIndex);
        if (termOffsets == null)
        {
            // No offset data - a token stream cannot be rebuilt.
            return null;
        }

        // Positions are only consulted when the caller vouches that they are contiguous.
        int[] positions = tokenPositionsGuaranteedContiguous ? tpv.GetTermPositions(termIndex) : null;

        if (positions != null)
        {
            // Fast path: each position is used directly as the slot in the result array.
            // This breaks down if a tokenizer emits >1 token at one position or
            // leaves jumps in the position numbers.
            for (int i = 0; i < positions.Length; i++)
            {
                orderedTokens[positions[i]] = new Token(termTexts[termIndex], termOffsets[i].GetStartOffset(), termOffsets[i].GetEndOffset());
            }
            continue;
        }

        // Slow path: no positions stored, or no contiguity guarantee - collect now, sort later.
        if (pendingSort == null)
        {
            pendingSort = new System.Collections.ArrayList();
        }
        foreach (TermVectorOffsetInfo offsetInfo in termOffsets)
        {
            pendingSort.Add(new Token(termTexts[termIndex], offsetInfo.GetStartOffset(), offsetInfo.GetEndOffset()));
        }
    }

    if (pendingSort != null)
    {
        // Tokens gathered without position data replace the array and must be sorted.
        orderedTokens = (Token[]) pendingSort.ToArray(typeof(Token));
        Array.Sort(orderedTokens, new AnonymousClassComparator());
    }

    return new StoredTokenStream(orderedTokens);
}
/// <summary>
/// Convenience overload that rebuilds the token stream without making any
/// assumptions about token position sequences.
/// </summary>
/// <param name="tpv">term vector to rebuild the tokens from</param>
/// <returns>whatever the two-argument overload returns when called with <c>false</c></returns>
public static TokenStream GetTokenStream(TermPositionVector tpv)
{
    // Assume the worst: positions may overlap or contain gaps.
    const bool positionsGuaranteedContiguous = false;
    return GetTokenStream(tpv, positionsGuaranteedContiguous);
}
/// <summary>
/// Low level api.
/// Rebuilds a token stream from a stored term vector, or returns null if no
/// offset info is available in the index. The result can be used to feed the
/// highlighter with a pre-parsed token stream.
///
/// Timings reported by the original author for recreating 1000 token streams:
/// - with TermVector offset only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
///   (nb timings for TermVector with position data are based on a tokenizer
///   with contiguous positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store pre-parsed content and
/// using an analyzer to re-parse the original content instead:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on -
/// 1) The complexity of the analyzer code (timings above were using a
///    stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
///    when accessing just one document field - can cost dear!)
/// 3) Use of compression on field storage - could be faster because of
///    compression (less disk IO) or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">term vector supplying the terms, their frequencies, offsets and (optionally) positions</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps.
/// If looking to eke out the last drops of performance, set to true. If in doubt, set to false.
/// </param>
/// <returns>null as soon as any term has no offsets; otherwise a StoredTokenStream over the rebuilt tokens</returns>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    System.String[] termTexts = tpv.GetTerms();
    int[] termCounts = tpv.GetTermFrequencies();

    // The sum of all term frequencies sizes the position-indexed result array.
    int tokenCount = 0;
    foreach (int count in termCounts)
    {
        tokenCount += count;
    }

    Token[] orderedTokens = new Token[tokenCount];
    System.Collections.ArrayList pendingSort = null;

    for (int termIndex = 0; termIndex < termCounts.Length; termIndex++)
    {
        TermVectorOffsetInfo[] termOffsets = tpv.GetOffsets(termIndex);
        if (termOffsets == null)
        {
            // No offset data - a token stream cannot be rebuilt.
            return(null);
        }

        // Positions are only consulted when the caller vouches that they are contiguous.
        int[] positions = tokenPositionsGuaranteedContiguous ? tpv.GetTermPositions(termIndex) : null;

        if (positions != null)
        {
            // Fast path: each position is used directly as the slot in the result array.
            // This breaks down if a tokenizer emits >1 token at one position or
            // leaves jumps in the position numbers.
            for (int i = 0; i < positions.Length; i++)
            {
                orderedTokens[positions[i]] = new Token(termTexts[termIndex], termOffsets[i].GetStartOffset(), termOffsets[i].GetEndOffset());
            }
            continue;
        }

        // Slow path: no positions stored, or no contiguity guarantee - collect now, sort later.
        if (pendingSort == null)
        {
            pendingSort = new System.Collections.ArrayList();
        }
        foreach (TermVectorOffsetInfo offsetInfo in termOffsets)
        {
            pendingSort.Add(new Token(termTexts[termIndex], offsetInfo.GetStartOffset(), offsetInfo.GetEndOffset()));
        }
    }

    if (pendingSort != null)
    {
        // Tokens gathered without position data replace the array and must be sorted.
        orderedTokens = (Token[])pendingSort.ToArray(typeof(Token));
        Array.Sort(orderedTokens, new AnonymousClassComparator());
    }

    return(new StoredTokenStream(orderedTokens));
}
/// <summary>
/// Convenience overload that rebuilds the token stream without making any
/// assumptions about token position sequences.
/// </summary>
/// <param name="tpv">term vector to rebuild the tokens from</param>
/// <returns>whatever the two-argument overload returns when called with <c>false</c></returns>
public static TokenStream GetTokenStream(TermPositionVector tpv)
{
    // Assume the worst: positions may overlap or contain gaps.
    const bool positionsGuaranteedContiguous = false;
    return(GetTokenStream(tpv, positionsGuaranteedContiguous));
}