/// <summary> Add a complete document specified by all its term vectors. If the document
/// has no term vectors, only the tvx entry is written.
/// </summary>
/// <param name="vectors">all term vectors for the document, one per field</param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();
    if (vectors != null)
    {
        for (int i = 0; i < vectors.Length; i++)
        {
            bool storePositionWithTermVector = false;
            bool storeOffsetWithTermVector = false;

            // A TermPositionVector carries position and/or offset data in addition to
            // term frequencies; test the type with 'as' rather than casting and
            // catching InvalidCastException, so control flow does not rely on exceptions.
            TermPositionVector tpVector = vectors[i] as TermPositionVector;
            if (tpVector != null)
            {
                if (tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null)
                {
                    storePositionWithTermVector = true;
                }
                if (tpVector.Size() > 0 && tpVector.GetOffsets(0) != null)
                {
                    storeOffsetWithTermVector = true;
                }

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                for (int j = 0; j < tpVector.Size(); j++)
                {
                    AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j],
                                    tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }

                CloseField();
            }
            else
            {
                // Plain TermFreqVector: terms and frequencies only, no positions or offsets.
                TermFreqVector tfVector = vectors[i];

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                for (int j = 0; j < tfVector.Size(); j++)
                {
                    AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                }

                CloseField();
            }
        }
    }
    CloseDocument();
}
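// A minimal usage sketch: copying the stored term vectors of every live document
// from an existing index into a new term-vectors file. 'sourceReader', 'destDir',
// 'fieldInfos', and the segment name "_copy" are hypothetical, and the
// three-argument TermVectorsWriter constructor (Directory, segment, FieldInfos)
// is an assumption about this codebase's API rather than a confirmed signature.
TermVectorsWriter tvWriter = new TermVectorsWriter(destDir, "_copy", fieldInfos);
try
{
    for (int docId = 0; docId < sourceReader.MaxDoc(); docId++)
    {
        if (sourceReader.IsDeleted(docId))
            continue;

        // GetTermFreqVectors returns null when the document stored no vectors;
        // AddAllDocVectors handles that case by writing only the tvx entry.
        TermFreqVector[] vectors = sourceReader.GetTermFreqVectors(docId);
        tvWriter.AddAllDocVectors(vectors);
    }
}
finally
{
    tvWriter.Close();
}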
public virtual void TestMixedVectorsVectors()
{
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true,
                                         IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // Add the same field five times with every term-vector setting; Lucene unifies
    // the flags per field, so the most permissive setting (WITH_POSITIONS_OFFSETS)
    // wins for the whole document.
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    searcher = new IndexSearcher(directory);

    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
    Assert.AreEqual(1, hits.Length);

    TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[0].doc);
    Assert.IsTrue(vector != null);
    Assert.IsTrue(vector.Length == 1);

    TermPositionVector tfv = (TermPositionVector) vector[0];
    Assert.IsTrue(tfv.GetField().Equals("field"));

    System.String[] terms = tfv.GetTerms();
    Assert.AreEqual(1, terms.Length);
    Assert.AreEqual(terms[0], "one");
    Assert.AreEqual(5, tfv.GetTermFrequencies()[0]);

    int[] positions = tfv.GetTermPositions(0);
    Assert.AreEqual(5, positions.Length);
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(i, positions[i]);
    }

    TermVectorOffsetInfo[] offsets = tfv.GetOffsets(0);
    Assert.AreEqual(5, offsets.Length);
    for (int i = 0; i < 5; i++)
    {
        // Each "one " occupies four characters, so token i spans [4*i, 4*i + 3].
        Assert.AreEqual(4 * i, offsets[i].GetStartOffset());
        Assert.AreEqual(4 * i + 3, offsets[i].GetEndOffset());
    }
}
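// The direct cast to TermPositionVector in the test above is safe only because the
// strongest setting won; when the indexing options are unknown, a type test avoids
// an InvalidCastException. A minimal sketch, where 'reader' and 'docId' are assumed
// to be an open IndexReader and a valid document id:
TermFreqVector[] vecs = reader.GetTermFreqVectors(docId);
if (vecs != null)
{
    foreach (TermFreqVector v in vecs)
    {
        // Same type test AddAllDocVectors uses: a plain TermFreqVector has no
        // position or offset data.
        TermPositionVector tpv = v as TermPositionVector;
        if (tpv != null && tpv.Size() > 0)
        {
            bool hasPositions = tpv.GetTermPositions(0) != null;
            bool hasOffsets = tpv.GetOffsets(0) != null;
            Console.WriteLine("{0}: positions={1}, offsets={2}",
                              tpv.GetField(), hasPositions, hasOffsets);
        }
    }
}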
/// <summary>
/// Low level api.
/// Returns a token stream, or null if no offset info is available in the index.
/// This can be used to feed the highlighter with a pre-parsed token stream.
///
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// - with TermVector offset-only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
/// (nb: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store
/// pre-parsed content, and instead using an analyzer to re-parse the original content:
/// - re-analyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on:
/// 1) The complexity of the analyzer code (timings above were using a
///    stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
///    when accessing just one document field - this can cost dear!)
/// 3) Use of compression on field storage - could be faster due to compression (less disk IO)
///    or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">the term position vector to rebuild the token stream from</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    // Reconstruct the original sequence of Tokens from the stored vector data.
    String[] terms = tpv.GetTerms();
    int[] freq = tpv.GetTermFrequencies();
    int totalTokens = freq.Sum();

    var tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;
    for (int t = 0; t < freq.Length; t++)
    {
        TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
        if (offsets == null)
        {
            // Without offsets the highlighter cannot place fragments, so give up.
            return null;
        }

        int[] pos = null;
        if (tokenPositionsGuaranteedContiguous)
        {
            // Try to get the token position info to speed up assembly of tokens into sorted sequence.
            pos = tpv.GetTermPositions(t);
        }

        if (pos == null)
        {
            // Tokens NOT stored with positions, or not guaranteed contiguous - collect them and sort later.
            if (unsortedTokens == null)
            {
                unsortedTokens = new List<Token>();
            }
            foreach (TermVectorOffsetInfo t1 in offsets)
            {
                var token = new Token(t1.StartOffset, t1.EndOffset);
                token.SetTermBuffer(terms[t]);
                unsortedTokens.Add(token);
            }
        }
        else
        {
            // We have positions stored and a guarantee that the token position information is contiguous.
            // This may be fast BUT won't work with tokenizers that create more than one token in the
            // same position, or that create jumps in the position numbers.
            // Tokens stored with positions - use the position to index straight into the sorted array.
            for (int tp = 0; tp < pos.Length; tp++)
            {
                var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                tokensInOriginalOrder[pos[tp]] = token;
            }
        }
    }

    // If the field has been stored without position data we must sort by start offset.
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        Array.Sort(tokensInOriginalOrder, (t1, t2) =>
        {
            if (t1.StartOffset > t2.StartOffset)
                return 1;
            if (t1.StartOffset < t2.StartOffset)
                return -1;
            return 0;
        });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
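// A minimal sketch of feeding the reconstructed stream to the highlighter, assuming
// an open IndexReader ('reader'), a parsed 'query', a hit 'docId' whose "field" was
// indexed WITH_POSITIONS_OFFSETS and stored, and that this method lives on the
// TokenSources class of the highlighter contrib. The Highlighter(Scorer) constructor
// and GetBestFragment(TokenStream, string) calls are assumptions about that package's
// API; verify the signatures before relying on this.
var tpv = (TermPositionVector) reader.GetTermFreqVector(docId, "field");
TokenStream stream = TokenSources.GetTokenStream(tpv, false);
if (stream != null)
{
    string text = reader.Document(docId).Get("field");
    var highlighter = new Highlighter(new QueryScorer(query));
    string fragment = highlighter.GetBestFragment(stream, text);
    Console.WriteLine(fragment);
}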