Example #1
        /// <summary> Add a complete document specified by all its term vectors. If the document has
        /// no term vectors, an entry is still written to the tvx (term-vector index) file.
        /// </summary>
        /// <param name="vectors">the document's term vectors, or null if it has none
        /// </param>
        /// <throws>  IOException </throws>
        public void  AddAllDocVectors(TermFreqVector[] vectors)
        {
            OpenDocument();

            if (vectors != null)
            {
                for (int i = 0; i < vectors.Length; i++)
                {
                    bool storePositionWithTermVector = false;
                    bool storeOffsetWithTermVector   = false;

                    // Probe with a safe cast instead of catching InvalidCastException:
                    // a TermPositionVector may carry positions and/or offsets in
                    // addition to term frequencies.
                    var tpVector = vectors[i] as TermPositionVector;

                    if (tpVector != null)
                    {
                        if (tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null)
                        {
                            storePositionWithTermVector = true;
                        }
                        if (tpVector.Size() > 0 && tpVector.GetOffsets(0) != null)
                        {
                            storeOffsetWithTermVector = true;
                        }

                        FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                        OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                        for (int j = 0; j < tpVector.Size(); j++)
                        {
                            AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                        }

                        CloseField();
                    }
                    else
                    {
                        // Plain TermFreqVector: frequencies only, no positions or offsets.
                        TermFreqVector tfVector = vectors[i];

                        FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                        OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                        for (int j = 0; j < tfVector.Size(); j++)
                        {
                            AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                        }

                        CloseField();
                    }
                }
            }

            CloseDocument();
        }
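
A minimal usage sketch, not taken from the source: copying every document's term vectors from an existing IndexReader into a new segment. The TermVectorsWriter constructor arguments and the MaxDoc()/GetTermFreqVectors() calls follow the Lucene.Net 2.9 API; names like destDir, fieldInfos, and reader are placeholders.

        TermVectorsWriter vectorsWriter = new TermVectorsWriter(destDir, "newSegment", fieldInfos);
        try
        {
            for (int docId = 0; docId < reader.MaxDoc(); docId++)
            {
                // GetTermFreqVectors returns null for documents that stored no
                // vectors; AddAllDocVectors still records a tvx entry for them.
                vectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docId));
            }
        }
        finally
        {
            vectorsWriter.Close();
        }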
Example #2
        public virtual void TestMixedVectorsVectors()
        {
            IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
            Document    doc    = new Document();

            doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
            doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
            doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
            doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
            doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.AddDocument(doc);
            writer.Close();

            searcher = new IndexSearcher(directory);

            Query query = new TermQuery(new Term("field", "one"));

            ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
            Assert.AreEqual(1, hits.Length);

            TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[0].doc);
            Assert.IsNotNull(vector);
            Assert.AreEqual(1, vector.Length);
            TermPositionVector tfv = (TermPositionVector)vector[0];

            Assert.AreEqual("field", tfv.GetField());
            System.String[] terms = tfv.GetTerms();
            Assert.AreEqual(1, terms.Length);
            Assert.AreEqual("one", terms[0]);
            Assert.AreEqual(5, tfv.GetTermFrequencies()[0]);

            int[] positions = tfv.GetTermPositions(0);
            Assert.AreEqual(5, positions.Length);
            for (int i = 0; i < 5; i++)
            {
                Assert.AreEqual(i, positions[i]);
            }
            TermVectorOffsetInfo[] offsets = tfv.GetOffsets(0);
            Assert.AreEqual(5, offsets.Length);
            for (int i = 0; i < 5; i++)
            {
                Assert.AreEqual(4 * i, offsets[i].GetStartOffset());
                Assert.AreEqual(4 * i + 3, offsets[i].GetEndOffset());
            }
        }
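
Note on why these assertions hold: when the same field name is added several times with different TermVector settings, Lucene combines (logically ORs) the flags at indexing time. The five Field instances above therefore collapse into a single stored vector carrying both positions and offsets, which is why the cast to TermPositionVector succeeds and the term "one" reports frequency 5, positions 0 through 4, and offsets starting at 4*i (each "one" is three characters plus the analyzer's default offset gap of one between field instances).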
Example #3
        /// <summary>
        /// Low level api.
        /// Returns a token stream or null if no offset info is available in the index.
        /// This can be used to feed the highlighter with a pre-parsed token stream.
        ///
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// - with TermVector offset-only data stored - 420 milliseconds
        /// - with TermVector offset AND position data stored - 271 milliseconds
        ///   (nb: timings for TermVector with position data are based on a tokenizer with contiguous
        ///   positions - no overlaps or gaps)
        /// The cost of not using TermPositionVector to store
        /// pre-parsed content and using an analyzer to re-parse the original content:
        /// - reanalyzing the original content - 980 milliseconds
        ///
        /// The re-analyze timings will typically vary depending on:
        ///  1) The complexity of the analyzer code (timings above were using a
        ///     stemmer/lowercaser/stopword combo)
        ///  2) The number of other fields (Lucene reads ALL fields off the disk
        ///     when accessing just one document field - this can cost dearly!)
        ///  3) Use of compression on field storage - could be faster due to compression (less disk IO)
        ///     or slower (more CPU burn) depending on the content.
        /// </summary>
        /// <param name="tpv">the term position vector to rebuild the token stream from; it must have offsets stored</param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            //code to reconstruct the original sequence of Tokens
            String[] terms = tpv.GetTerms();
            int[] freq = tpv.GetTermFrequencies();

            int totalTokens = freq.Sum();

            var tokensInOriginalOrder = new Token[totalTokens];
            List<Token> unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }

                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    //try to get the token position info to speed up assembly of tokens into sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new List<Token>();
                    }

                    foreach (TermVectorOffsetInfo t1 in offsets)
                    {
                        var token = new Token(t1.StartOffset, t1.EndOffset);
                        token.SetTermBuffer(terms[t]);
                        unsortedTokens.Add(token);
                    }
                }
                else
                {
                    //We have positions stored and a guarantee that the token position information is contiguous

                    // This may be fast BUT won't work if the tokenizer creates more than one token in the
                    // same position, or leaves jumps in the position numbers - this code would fail under those circumstances

                    //tokens stored with positions - can use this to index straight into sorted array
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                        tokensInOriginalOrder[pos[tp]] = token;
                    }
                }
            }
            //If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                Array.Sort(tokensInOriginalOrder, (t1, t2) =>
                                                      {
                                                          if (t1.StartOffset > t2.StartOffset)
                                                              return 1;
                                                          if (t1.StartOffset < t2.StartOffset)
                                                              return -1;
                                                          return 0;
                                                      });
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }
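
A hypothetical sketch of how the returned stream might feed the contrib Highlighter (this method corresponds to the TokenSources helper in the Lucene highlighter package; reader, docId, query, and text are placeholders, and the Highlighter/QueryScorer calls follow the contrib API):

            var tpv = (TermPositionVector) reader.GetTermFreqVector(docId, "body");
            TokenStream stream = GetTokenStream(tpv, false);
            if (stream != null)
            {
                // Falls back to re-analyzing the stored text only when stream is null.
                var highlighter = new Highlighter(new QueryScorer(query));
                string fragment = highlighter.GetBestFragment(stream, text);
            }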