private static JArray GetTerms(IndexSearcher searcher, int doc, string field)
{
    TermPositionVector termPositionVector = (TermPositionVector)searcher.IndexReader.GetTermFreqVector(doc, field);
    if (termPositionVector == null)
    {
        return null;
    }

    JArray array = new JArray();
    string[] terms = termPositionVector.GetTerms();
    for (int i = 0; i < terms.Length; i++)
    {
        // Collect every (start,end) character offset recorded for this term.
        string offset = "";
        foreach (TermVectorOffsetInfo offsetInfo in termPositionVector.GetOffsets(i))
        {
            offset += string.Format("({0},{1})", offsetInfo.StartOffset, offsetInfo.EndOffset);
        }
        array.Add(terms[i] + " " + offset);
    }
    return array;
}
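A minimal driver for a helper like GetTerms above; a sketch assuming Lucene.Net 2.9-era APIs and Json.NET's JArray, with the directory, analyzer, and "body" field name chosen for illustration rather than taken from the snippet's project:

using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Newtonsoft.Json.Linq;

RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
// Offsets are only stored when the field's term vector setting includes them.
doc.Add(new Field("body", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.AddDocument(doc);
writer.Close();

IndexSearcher searcher = new IndexSearcher(dir);
// Terms come back in sorted order: ["content (5,12)", "here (13,17)", "some (0,4)"]
JArray terms = GetTerms(searcher, 0, "body");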
public virtual void TestOffsetReader()
{
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    TermPositionVector vector = (TermPositionVector)reader.Get(0, testFields[0]);
    Assert.IsTrue(vector != null);
    System.String[] terms = vector.GetTerms();
    Assert.IsTrue(terms != null);
    Assert.IsTrue(terms.Length == testTerms.Length);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        Assert.IsTrue(term.Equals(testTerms[i]));

        int[] positions = vector.GetTermPositions(i);
        Assert.IsTrue(positions != null);
        Assert.IsTrue(positions.Length == this.positions[i].Length);
        for (int j = 0; j < positions.Length; j++)
        {
            Assert.IsTrue(positions[j] == this.positions[i][j]);
        }

        TermVectorOffsetInfo[] offset = vector.GetOffsets(i);
        Assert.IsTrue(offset != null);
        Assert.IsTrue(offset.Length == this.offsets[i].Length);
        for (int j = 0; j < offset.Length; j++)
        {
            Assert.IsTrue(offset[j].Equals(offsets[i][j]));
        }
    }
}
public virtual void TestTermVectorsFieldOrder()
{
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    doc.Add(new Field("c", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("a", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("b", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("x", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    IndexReader reader = IndexReader.Open(dir);
    TermFreqVector[] v = reader.GetTermFreqVectors(0);
    Assert.AreEqual(4, v.Length);
    // Fields come back in lexicographic order, regardless of the order they were added in.
    System.String[] expectedFields = new System.String[] { "a", "b", "c", "x" };
    // Terms within each vector are sorted too: "content" (pos 1), "here" (pos 2), "some" (pos 0).
    int[] expectedPositions = new int[] { 1, 2, 0 };
    for (int i = 0; i < v.Length; i++)
    {
        TermPositionVector posVec = (TermPositionVector)v[i];
        Assert.AreEqual(expectedFields[i], posVec.GetField());
        System.String[] terms = posVec.GetTerms();
        Assert.AreEqual(3, terms.Length);
        Assert.AreEqual("content", terms[0]);
        Assert.AreEqual("here", terms[1]);
        Assert.AreEqual("some", terms[2]);
        for (int j = 0; j < 3; j++)
        {
            int[] positions = posVec.GetTermPositions(j);
            Assert.AreEqual(1, positions.Length);
            Assert.AreEqual(expectedPositions[j], positions[0]);
        }
    }
}
/// <summary>
/// Gets the document at the specified position in the current search result set.
/// </summary>
/// <param name="index">Zero-based index of the document within the search results.</param>
/// <returns>The matched document together with the character offset of the first hit keyword.</returns>
public Document this[int index]
{
    get
    {
        int offset = 0;
        // Term position vector for the "body" field of this hit.
        TermPositionVector termPositionVector = (TermPositionVector)this.reader.GetTermFreqVector(this._hits.Id(index), "body");
        // Only proceed if a position vector was stored for the field.
        if (termPositionVector != null)
        {
            int pos = -1;
            for (int i = 0; i < terms.Length; i++)
            {
                // Position of the first matching keyword within the vector's term list.
                pos = System.Array.IndexOf<string>(termPositionVector.GetTerms(), terms[i]);
                if (pos > -1)
                {
                    break;
                }
            }
            // If a keyword was found, take its first character offset within the body text.
            if (pos > -1)
            {
                TermVectorOffsetInfo[] tvois = termPositionVector.GetOffsets(pos);
                offset = tvois[0].GetStartOffset();
            }
        }
        return new Hit(this._hits.Doc(index), offset);
    }
}
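Because GetTerms() returns the vector's terms in sorted order, the linear Array.IndexOf scan above can be replaced by a binary search. A sketch under the assumption that ordinal comparison matches Lucene's term ordering; FindFirstKeywordOffset is a hypothetical helper name, not part of the original class:

private static int FindFirstKeywordOffset(TermPositionVector vector, string[] keywords)
{
    string[] sortedTerms = vector.GetTerms(); // already sorted by Lucene
    foreach (string keyword in keywords)
    {
        int pos = System.Array.BinarySearch(sortedTerms, keyword, System.StringComparer.Ordinal);
        if (pos >= 0)
        {
            // Start offset of the keyword's first occurrence in the original text.
            return vector.GetOffsets(pos)[0].GetStartOffset();
        }
    }
    return 0; // no keyword found; fall back to the start of the document
}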
public FieldTermStack(IndexReader reader, int docId, String fieldName, FieldQuery fieldQuery)
{
    this.fieldName = fieldName;

    TermFreqVector tfv = reader.GetTermFreqVector(docId, fieldName);
    if (tfv == null)
    {
        return; // just return to make null snippets
    }

    TermPositionVector tpv = null;
    try
    {
        tpv = (TermPositionVector)tfv;
    }
    catch (InvalidCastException)
    {
        return; // just return to make null snippets
    }

    List<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    foreach (String term in tpv.GetTerms())
    {
        if (!termSet.Contains(term))
        {
            continue;
        }
        int index = tpv.IndexOf(term);
        TermVectorOffsetInfo[] tvois = tpv.GetOffsets(index);
        if (tvois == null)
        {
            return; // just return to make null snippets
        }
        int[] poss = tpv.GetTermPositions(index);
        if (poss == null)
        {
            return; // just return to make null snippets
        }
        for (int i = 0; i < tvois.Length; i++)
        {
            termList.AddLast(new TermInfo(term, tvois[i].GetStartOffset(), tvois[i].GetEndOffset(), poss[i]));
        }
    }

    // sort by position
    Sort(termList);
}
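Stripped of the FastVectorHighlighter types, the constructor's core pattern is: for each term kept by the query, pair occurrence i's character offsets with token position i, then sort by position. A standalone sketch of that pattern under Lucene.Net 2.9-era APIs; VectorFlattener and TermOccurrence are hypothetical names, not part of the highlighter:

using System.Collections.Generic;
using Lucene.Net.Index;

public static class VectorFlattener
{
    public struct TermOccurrence
    {
        public string Term;
        public int Start, End, Position;
    }

    public static List<TermOccurrence> Flatten(TermPositionVector tpv)
    {
        var occurrences = new List<TermOccurrence>();
        string[] terms = tpv.GetTerms();
        for (int i = 0; i < terms.Length; i++)
        {
            TermVectorOffsetInfo[] offsets = tpv.GetOffsets(i);
            int[] positions = tpv.GetTermPositions(i);
            if (offsets == null || positions == null)
            {
                continue; // field indexed without offsets or positions
            }
            for (int j = 0; j < offsets.Length; j++)
            {
                occurrences.Add(new TermOccurrence
                {
                    Term = terms[i],
                    Start = offsets[j].GetStartOffset(),
                    End = offsets[j].GetEndOffset(),
                    Position = positions[j],
                });
            }
        }
        // Restore original token order: occurrences arrive grouped by term, not by position.
        occurrences.Sort((a, b) => a.Position.CompareTo(b.Position));
        return occurrences;
    }
}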
/// <summary> Add a complete document specified by all its term vectors. If the document has no
/// term vectors, add a value for tvx.
/// </summary>
/// <param name="vectors"></param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();
    if (vectors != null)
    {
        for (int i = 0; i < vectors.Length; i++)
        {
            bool storePositionWithTermVector = false;
            bool storeOffsetWithTermVector = false;
            try
            {
                // If the vector also carries positions/offsets, preserve them on write.
                TermPositionVector tpVector = (TermPositionVector)vectors[i];
                if (tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null)
                {
                    storePositionWithTermVector = true;
                }
                if (tpVector.Size() > 0 && tpVector.GetOffsets(0) != null)
                {
                    storeOffsetWithTermVector = true;
                }

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
                for (int j = 0; j < tpVector.Size(); j++)
                {
                    AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }
                CloseField();
            }
            catch (System.InvalidCastException)
            {
                // Frequency-only vector: write terms and frequencies without positions/offsets.
                TermFreqVector tfVector = vectors[i];
                FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
                for (int j = 0; j < tfVector.Size(); j++)
                {
                    AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                }
                CloseField();
            }
        }
    }
    CloseDocument();
}
public virtual void TestMixedVectrosVectors()
{
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // The same field is added five times with different term-vector settings;
    // the index keeps the most expressive setting (positions and offsets).
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    searcher = new IndexSearcher(directory);
    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
    Assert.AreEqual(1, hits.Length);

    TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[0].doc);
    Assert.IsTrue(vector != null);
    Assert.IsTrue(vector.Length == 1);
    TermPositionVector tfv = (TermPositionVector)vector[0];
    Assert.IsTrue(tfv.GetField().Equals("field"));
    System.String[] terms = tfv.GetTerms();
    Assert.AreEqual(1, terms.Length);
    Assert.AreEqual("one", terms[0]);
    Assert.AreEqual(5, tfv.GetTermFrequencies()[0]);

    int[] positions = tfv.GetTermPositions(0);
    Assert.AreEqual(5, positions.Length);
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(i, positions[i]);
    }
    TermVectorOffsetInfo[] offsets = tfv.GetOffsets(0);
    Assert.AreEqual(5, offsets.Length);
    for (int i = 0; i < 5; i++)
    {
        // Each "one " occupies 4 characters, so occurrence i spans [4*i, 4*i+3).
        Assert.AreEqual(4 * i, offsets[i].GetStartOffset());
        Assert.AreEqual(4 * i + 3, offsets[i].GetEndOffset());
    }
}
public virtual void TestTermPositionVectors()
{
    Query query = new TermQuery(new Term("field", "zero"));
    try
    {
        ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
        Assert.AreEqual(1, hits.Length);
        for (int i = 0; i < hits.Length; i++)
        {
            TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[i].doc);
            Assert.IsTrue(vector != null);
            Assert.IsTrue(vector.Length == 1);

            // The test data stores positions for even-numbered doc ids and offsets for
            // ids divisible by three; either setting yields a TermPositionVector.
            bool shouldBePosVector = hits[i].doc % 2 == 0;
            Assert.IsTrue(!shouldBePosVector || vector[0] is TermPositionVector);
            bool shouldBeOffVector = hits[i].doc % 3 == 0;
            Assert.IsTrue(!shouldBeOffVector || vector[0] is TermPositionVector);

            if (shouldBePosVector || shouldBeOffVector)
            {
                TermPositionVector posVec = (TermPositionVector)vector[0];
                System.String[] terms = posVec.GetTerms();
                Assert.IsTrue(terms != null && terms.Length > 0);
                for (int j = 0; j < terms.Length; j++)
                {
                    int[] positions = posVec.GetTermPositions(j);
                    TermVectorOffsetInfo[] offsets = posVec.GetOffsets(j);
                    if (shouldBePosVector)
                    {
                        Assert.IsTrue(positions != null);
                        Assert.IsTrue(positions.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(positions == null);
                    }
                    if (shouldBeOffVector)
                    {
                        Assert.IsTrue(offsets != null);
                        Assert.IsTrue(offsets.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(offsets == null);
                    }
                }
            }
            else
            {
                // Frequency-only vectors must not be castable to TermPositionVector.
                try
                {
                    TermPositionVector posVec = (TermPositionVector)vector[0];
                    Assert.Fail("cast to TermPositionVector should have thrown");
                }
                catch (System.InvalidCastException)
                {
                    TermFreqVector freqVec = vector[0];
                    System.String[] terms = freqVec.GetTerms();
                    Assert.IsTrue(terms != null && terms.Length > 0);
                }
            }
        }
    }
    catch (System.IO.IOException)
    {
        Assert.Fail("unexpected IOException");
    }
}
/// <summary>
/// Low level api.
/// Returns a token stream or null if no offset info available in index.
/// This can be used to feed the highlighter with a pre-parsed token stream.
///
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// - with TermVector offset only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on -
/// 1) The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// 3) Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">the term position vector to rebuild the token stream from</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    // Reconstruct the original sequence of Tokens from the stored vector.
    String[] terms = tpv.GetTerms();
    int[] freq = tpv.GetTermFrequencies();
    int totalTokens = freq.Sum();

    var tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;
    for (int t = 0; t < freq.Length; t++)
    {
        TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
        if (offsets == null)
        {
            return null; // no offsets stored - cannot rebuild a token stream
        }

        int[] pos = null;
        if (tokenPositionsGuaranteedContiguous)
        {
            // Try to get the token position info to speed up assembly of tokens into sorted sequence.
            pos = tpv.GetTermPositions(t);
        }

        if (pos == null)
        {
            // Tokens NOT stored with positions, or not guaranteed contiguous - add to a list and sort later.
            if (unsortedTokens == null)
            {
                unsortedTokens = new List<Token>();
            }
            foreach (TermVectorOffsetInfo t1 in offsets)
            {
                var token = new Token(t1.StartOffset, t1.EndOffset);
                token.SetTermBuffer(terms[t]);
                unsortedTokens.Add(token);
            }
        }
        else
        {
            // We have positions stored and a guarantee that the token position information is contiguous.
            // This may be fast BUT won't work if tokenizers create more than one token in the same
            // position, or create jumps in position numbers - this code would fail in those cases.
            // Tokens stored with positions - use the position to index straight into the sorted array.
            for (int tp = 0; tp < pos.Length; tp++)
            {
                var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                tokensInOriginalOrder[pos[tp]] = token;
            }
        }
    }

    // If the field has been stored without position data we must sort by start offset.
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        Array.Sort(tokensInOriginalOrder, (t1, t2) =>
        {
            if (t1.StartOffset > t2.StartOffset)
                return 1;
            if (t1.StartOffset < t2.StartOffset)
                return -1;
            return 0;
        });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
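A usage sketch for the method above, assuming it lives on the contrib highlighter's TokenSources class (as in Lucene.Net) and that reader, docId, and query are already in scope; the "body" field must have been indexed with Field.TermVector.WITH_POSITIONS_OFFSETS:

// Rebuild the token stream from the stored vector and hand it to the
// contrib Highlighter, avoiding re-analysis of the stored text.
var tpv = (TermPositionVector)reader.GetTermFreqVector(docId, "body");
TokenStream stream = TokenSources.GetTokenStream(tpv, false); // false: don't rely on contiguous positions
if (stream != null)
{
    var highlighter = new Highlighter(new QueryScorer(query));
    string fragment = highlighter.GetBestFragment(stream, reader.Document(docId).Get("body"));
}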
public FieldTermStack(IndexReader reader, int docId, String fieldName, FieldQuery fieldQuery, IState state)
{
    this.fieldName = fieldName;

    var tfv = reader.GetTermFreqVector(docId, fieldName, state);
    if (tfv == null)
    {
        return; // just return to make null snippets
    }

    TermPositionVector tpv = null;
    try
    {
        tpv = (TermPositionVector)tfv;
    }
    catch (InvalidCastException)
    {
        return; // just return to make null snippets
    }

    List<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    // Fall back to wildcard matching only when the query actually contains wildcard terms.
    var needWildcard = termSet.Any(x => x.IndexOfAny(new char[] { '*', '?' }) != -1);

    foreach (String term in tpv.GetTerms())
    {
        if (needWildcard)
        {
            if (!termSet.Any(ts => WildcardMatcher.Matches(ts, term)))
            {
                continue;
            }
        }
        else if (!termSet.Contains(term))
        {
            continue;
        }

        int index = tpv.IndexOf(term);
        TermVectorOffsetInfo[] tvois = tpv.GetOffsets(index);
        if (tvois == null)
        {
            return; // just return to make null snippets
        }
        int[] poss = tpv.GetTermPositions(index);
        if (poss == null)
        {
            return; // just return to make null snippets
        }
        for (int i = 0; i < tvois.Length; i++)
        {
            termList.AddLast(new TermInfo(term, tvois[i].StartOffset, tvois[i].EndOffset, poss[i]));
        }
    }

    // sort by position
    Sort(termList);
}
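WildcardMatcher in this variant is project-internal. For readers without that code, a rough stand-in can translate Lucene-style wildcards into a regular expression; this is purely illustrative, not the actual implementation:

using System.Text.RegularExpressions;

// Illustrative stand-in for the project-internal WildcardMatcher used above:
// '*' matches any run of characters, '?' matches exactly one.
public static class SimpleWildcardMatcher
{
    public static bool Matches(string pattern, string term)
    {
        string regex = "^" + Regex.Escape(pattern).Replace("\\*", ".*").Replace("\\?", ".") + "$";
        return Regex.IsMatch(term, regex);
    }
}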