/// <summary> Low level API.
/// Returns a token stream, or null if no offset info is available in the index.
/// This can be used to feed the highlighter with a pre-parsed token stream.
///
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// - with TermVector offset-only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on:
/// 1) The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - this can be costly!)
/// 3) Use of compression on field storage - could be faster because of compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">the TermPositionVector to reconstruct the token stream from
/// </param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.
/// </param>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
	// Code to reconstruct the original sequence of Tokens.
	System.String[] terms = tpv.GetTerms();
	int[] freq = tpv.GetTermFrequencies();
	int totalTokens = 0;
	for (int t = 0; t < freq.Length; t++)
	{
		totalTokens += freq[t];
	}
	Token[] tokensInOriginalOrder = new Token[totalTokens];
	System.Collections.ArrayList unsortedTokens = null;
	for (int t = 0; t < freq.Length; t++)
	{
		TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
		if (offsets == null)
		{
			return null;
		}

		int[] pos = null;
		if (tokenPositionsGuaranteedContiguous)
		{
			// Try to get the token position info to speed up assembly of tokens into a sorted sequence.
			pos = tpv.GetTermPositions(t);
		}
		if (pos == null)
		{
			// Tokens NOT stored with positions, or positions not guaranteed contiguous:
			// must add to a list and sort later.
			if (unsortedTokens == null)
			{
				unsortedTokens = new System.Collections.ArrayList();
			}
			for (int tp = 0; tp < offsets.Length; tp++)
			{
				unsortedTokens.Add(new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset()));
			}
		}
		else
		{
			// We have positions stored and a guarantee that the token position information is contiguous.
			// This may be fast BUT won't work with tokenizers that create more than one token in the
			// same position, or that create jumps in position numbers - this code would fail under
			// those circumstances.
			// Tokens stored with positions - use the position to index straight into the sorted array.
			for (int tp = 0; tp < pos.Length; tp++)
			{
				tokensInOriginalOrder[pos[tp]] = new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset());
			}
		}
	}
	// If the field has been stored without position data we must perform a sort.
	if (unsortedTokens != null)
	{
		tokensInOriginalOrder = (Token[]) unsortedTokens.ToArray(typeof(Token));
		Array.Sort(tokensInOriginalOrder, new AnonymousClassComparator());
	}
	return new StoredTokenStream(tokensInOriginalOrder);
}
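
// A minimal usage sketch, assuming the Lucene.Net 2.x-era APIs IndexReader.GetTermFreqVector
// and Highlighter.GetBestFragment; "reader", "highlighter", "docNumber", "originalText" and
// the field name "contents" are illustrative names, not part of this class. The field must
// have been indexed with TermVector offsets for a non-null result.
//
//   TermFreqVector tfv = reader.GetTermFreqVector(docNumber, "contents");
//   if (tfv is TermPositionVector)
//   {
//       // Pass false for tokenPositionsGuaranteedContiguous unless the tokenizer is known
//       // to emit contiguous positions (no overlaps or gaps).
//       TokenStream ts = GetTokenStream((TermPositionVector) tfv, false);
//       if (ts != null) // null means no offset info was stored in the index
//       {
//           System.String fragment = highlighter.GetBestFragment(ts, originalText);
//       }
//   }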