/// <summary>
 /// checks term-level statistics
 /// </summary>
 public virtual void AssertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum)
 {
     Assert.AreEqual(leftTermsEnum.DocFreq(), rightTermsEnum.DocFreq());
     if (leftTermsEnum.TotalTermFreq() != -1 && rightTermsEnum.TotalTermFreq() != -1)
     {
         Assert.AreEqual(leftTermsEnum.TotalTermFreq(), rightTermsEnum.TotalTermFreq());
     }
 }
Example #2
0
        public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
        {
            Store.Directory    dir         = NewDirectory();
            Analyzer           analyzer    = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
            IndexWriter        w           = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document           doc         = new Document();
            TokenStream        tokenStream = analyzer.TokenStream("field", "abcd   ");
            TeeSinkTokenFilter tee         = new TeeSinkTokenFilter(tokenStream);
            TokenStream        sink        = tee.NewSinkTokenStream();
            FieldType          ft          = new FieldType(TextField.TYPE_NOT_STORED);

            ft.StoreTermVectors         = true;
            ft.StoreTermVectorOffsets   = true;
            ft.StoreTermVectorPositions = true;
            Field f1 = new Field("field", tee, ft);
            Field f2 = new Field("field", sink, ft);

            doc.Add(f1);
            doc.Add(f2);
            w.AddDocument(doc);
            w.Dispose();

            IndexReader r      = DirectoryReader.Open(dir);
            Terms       vector = r.GetTermVectors(0).Terms("field");

            assertEquals(1, vector.Size());
            TermsEnum termsEnum = vector.Iterator(null);

            termsEnum.Next();
            assertEquals(2, termsEnum.TotalTermFreq());
            DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);

            assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(2, positions.Freq());
            positions.NextPosition();
            assertEquals(0, positions.StartOffset());
            assertEquals(4, positions.EndOffset());
            positions.NextPosition();
            assertEquals(8, positions.StartOffset());
            assertEquals(12, positions.EndOffset());
            assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
            r.Dispose();
            dir.Dispose();
        }
Example #3
0
        /// <summary>
        /// Adds terms and frequencies found in vector into the Map termFreqMap
        /// </summary>
        /// <param name="termFreqMap"> a Map of terms and their frequencies </param>
        /// <param name="vector"> List of terms and their frequencies for a doc/field </param>
        private void AddTermFrequencies(IDictionary <string, Int> termFreqMap, Terms vector)
        {
            //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
            //ORIGINAL LINE: final org.apache.lucene.index.TermsEnum termsEnum = vector.iterator(null);
            TermsEnum termsEnum = vector.Iterator(null);
            //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
            //ORIGINAL LINE: final org.apache.lucene.util.CharsRef spare = new org.apache.lucene.util.CharsRef();
            CharsRef spare = new CharsRef();
            BytesRef text;

            while ((text = termsEnum.Next()) != null)
            {
                UnicodeUtil.UTF8toUTF16(text, spare);
                //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
                //ORIGINAL LINE: final String term = spare.toString();
                string term = spare.ToString();
                if (IsNoiseWord(term))
                {
                    continue;
                }
                //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
                //ORIGINAL LINE: final int freq = (int) termsEnum.totalTermFreq();
                int freq = (int)termsEnum.TotalTermFreq();

                // increment frequency
                Int cnt = termFreqMap[term];
                if (cnt == null)
                {
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x             = freq;
                }
                else
                {
                    cnt.x += freq;
                }
            }
        }
Example #4
0
        protected void CompareTermVectors(Terms terms, Terms memTerms, string field_name)
        {
            TermsEnum termEnum    = terms.Iterator(null);
            TermsEnum memTermEnum = memTerms.Iterator(null);

            while (termEnum.Next() != null)
            {
                assertNotNull(memTermEnum.Next());

                assertEquals(termEnum.TotalTermFreq(), memTermEnum.TotalTermFreq());

                DocsAndPositionsEnum docsPosEnum    = termEnum.DocsAndPositions(null, null, 0);
                DocsAndPositionsEnum memDocsPosEnum = memTermEnum.DocsAndPositions(null, null, 0);
                String currentTerm = termEnum.Term().Utf8ToString();


                assertEquals("Token mismatch for field: " + field_name, currentTerm, memTermEnum.Term().Utf8ToString());

                docsPosEnum.NextDoc();
                memDocsPosEnum.NextDoc();

                int freq = docsPosEnum.Freq();
                assertEquals(freq, memDocsPosEnum.Freq());
                for (int i = 0; i < freq; i++)
                {
                    string failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
                    int    memPos   = memDocsPosEnum.NextPosition();
                    int    pos      = docsPosEnum.NextPosition();
                    assertEquals("Position test failed" + failDesc, memPos, pos);
                    assertEquals("Start offset test failed" + failDesc, memDocsPosEnum.StartOffset(), docsPosEnum.StartOffset());
                    assertEquals("End offset test failed" + failDesc, memDocsPosEnum.EndOffset(), docsPosEnum.EndOffset());
                    assertEquals("Missing payload test failed" + failDesc, docsPosEnum.Payload, null);
                }
            }
            assertNull("Still some tokens not processed", memTermEnum.Next());
        }
Example #5
0
        public virtual void TestMixedVectrosVectors()
        {
            RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE));
            Document          doc    = new Document();

            FieldType ft2 = new FieldType(TextField.TYPE_STORED);

            ft2.StoreTermVectors = true;

            FieldType ft3 = new FieldType(TextField.TYPE_STORED);

            ft3.StoreTermVectors         = true;
            ft3.StoreTermVectorPositions = true;

            FieldType ft4 = new FieldType(TextField.TYPE_STORED);

            ft4.StoreTermVectors       = true;
            ft4.StoreTermVectorOffsets = true;

            FieldType ft5 = new FieldType(TextField.TYPE_STORED);

            ft5.StoreTermVectors         = true;
            ft5.StoreTermVectorOffsets   = true;
            ft5.StoreTermVectorPositions = true;

            doc.Add(NewTextField("field", "one", Field.Store.YES));
            doc.Add(NewField("field", "one", ft2));
            doc.Add(NewField("field", "one", ft3));
            doc.Add(NewField("field", "one", ft4));
            doc.Add(NewField("field", "one", ft5));
            writer.AddDocument(doc);
            IndexReader reader = writer.Reader;

            writer.Dispose();

            IndexSearcher searcher = NewSearcher(reader);

            Query query = new TermQuery(new Term("field", "one"));

            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc);

            Assert.IsNotNull(vectors);
            Assert.AreEqual(1, vectors.Size);
            Terms vector = vectors.Terms("field");

            Assert.IsNotNull(vector);
            Assert.AreEqual(1, vector.Size());
            TermsEnum termsEnum = vector.Iterator(null);

            Assert.IsNotNull(termsEnum.Next());
            Assert.AreEqual("one", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(5, termsEnum.TotalTermFreq());
            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);

            Assert.IsNotNull(dpEnum);
            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(5, dpEnum.Freq());
            for (int i = 0; i < 5; i++)
            {
                Assert.AreEqual(i, dpEnum.NextPosition());
            }

            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
            Assert.IsNotNull(dpEnum);
            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(5, dpEnum.Freq());
            for (int i = 0; i < 5; i++)
            {
                dpEnum.NextPosition();
                Assert.AreEqual(4 * i, dpEnum.StartOffset());
                Assert.AreEqual(4 * i + 3, dpEnum.EndOffset());
            }
            reader.Dispose();
        }
Example #6
0
        public virtual void CollectTermContext(IndexReader reader, IList <AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
        {
            TermsEnum termsEnum = null;

            foreach (AtomicReaderContext context in leaves)
            {
                Fields fields = context.AtomicReader.Fields;
                if (fields == null)
                {
                    // reader has no fields
                    continue;
                }
                for (int i = 0; i < queryTerms.Length; i++)
                {
                    Term        term        = queryTerms[i];
                    TermContext termContext = contextArray[i];
                    //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
                    //ORIGINAL LINE: final org.apache.lucene.index.Terms terms = fields.terms(term.field());
                    Terms terms = fields.Terms(term.Field());
                    if (terms == null)
                    {
                        // field does not exist
                        continue;
                    }
                    termsEnum = terms.Iterator(termsEnum);
                    Debug.Assert(termsEnum != null);

                    if (termsEnum == TermsEnum.EMPTY)
                    {
                        continue;
                    }
                    if (termsEnum.SeekExact(term.Bytes()))
                    {
                        if (termContext == null)
                        {
                            contextArray[i] = new TermContext(reader.Context, termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                        }
                        else
                        {
                            termContext.Register(termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                        }
                    }
                }
            }
        }
Example #7
0
            public override bool Collect(BytesRef bytes)
            {
                float boost = boostAtt.Boost;

                // make sure within a single seg we always collect
                // terms in order
                Debug.Assert(CompareToLastTerm(bytes));

                //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
                // ignore uncompetitive hits
                if (StQueue.Size() == MaxSize)
                {
                    ScoreTerm t = StQueue.Top();
                    if (boost < t.Boost)
                    {
                        return(true);
                    }
                    if (boost == t.Boost && termComp.Compare(bytes, t.Bytes) > 0)
                    {
                        return(true);
                    }
                }
                ScoreTerm t2;
                TermState state = termsEnum.TermState();

                Debug.Assert(state != null);
                if (visitedTerms.TryGetValue(bytes, out t2))
                {
                    // if the term is already in the PQ, only update docFreq of term in PQ
                    Debug.Assert(t2.Boost == boost, "boost should be equal in all segment TermsEnums");
                    t2.TermState.Register(state, ReaderContext.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                }
                else
                {
                    // add new entry in PQ, we must clone the term, else it may get overwritten!
                    st.Bytes.CopyBytes(bytes);
                    st.Boost = boost;
                    visitedTerms[st.Bytes] = st;
                    Debug.Assert(st.TermState.DocFreq == 0);
                    st.TermState.Register(state, ReaderContext.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                    StQueue.Add(st);
                    // possibly drop entries from queue
                    if (StQueue.Size() > MaxSize)
                    {
                        st = StQueue.Pop();
                        visitedTerms.Remove(st.Bytes);
                        st.TermState.Clear(); // reset the termstate!
                    }
                    else
                    {
                        st = new ScoreTerm(termComp, new TermContext(TopReaderContext));
                    }
                    Debug.Assert(StQueue.Size() <= MaxSize, "the PQ size must be limited to maxSize");
                    // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                    if (StQueue.Size() == MaxSize)
                    {
                        t2 = StQueue.Top();
                        maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
                        maxBoostAtt.CompetitiveTerm        = t2.Bytes;
                    }
                }

                return(true);
            }
Example #8
0
            internal void Fill(string field, TermsEnum termsEnum)
            {
                BytesRef term = null;

                while ((term = termsEnum.Next()) != null)
                {
                    InsertWithOverflow(new TermStats(field, term, termsEnum.DocFreq(), termsEnum.TotalTermFreq()));
                }
            }
Example #9
0
        /// <summary>
        /// Build the suggest index, using up to the specified
        ///  amount of temporary RAM while building.  Note that
        ///  the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
        {
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();
            // TODO: messy ... java7 has Files.createTempDirectory
            // ... but 4.x is java6:
            File   tempIndexPath = null;
            Random random        = new Random();

            while (true)
            {
                tempIndexPath = new File(directory, prefix + ".index." + random.Next(int.MaxValue));
                if (tempIndexPath.mkdir())
                {
                    break;
                }
            }

            Directory dir = FSDirectory.Open(tempIndexPath);

            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);

            iwc.OpenMode        = IndexWriterConfig.OpenMode_e.CREATE;
            iwc.RAMBufferSizeMB = ramBufferSizeMB;
            IndexWriter writer = new IndexWriter(dir, iwc);

            var ft = new FieldType(TextField.TYPE_NOT_STORED);

            // TODO: if only we had IndexOptions.TERMS_ONLY...
            ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
            ft.OmitNorms    = true;
            ft.Freeze();

            Document doc   = new Document();
            Field    field = new Field("body", "", ft);

            doc.Add(field);

            totTokens = 0;
            IndexReader reader = null;

            bool success = false;

            count = 0;
            try
            {
                while (true)
                {
                    BytesRef surfaceForm = iterator.Next();
                    if (surfaceForm == null)
                    {
                        break;
                    }
                    field.StringValue = surfaceForm.Utf8ToString();
                    writer.AddDocument(doc);
                    count++;
                }
                reader = DirectoryReader.Open(writer, false);

                Terms terms = MultiFields.GetTerms(reader, "body");
                if (terms == null)
                {
                    throw new System.ArgumentException("need at least one suggestion");
                }

                // Move all ngrams into an FST:
                TermsEnum termsEnum = terms.Iterator(null);

                Outputs <long?> outputs = PositiveIntOutputs.Singleton;
                Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                IntsRef scratchInts = new IntsRef();
                while (true)
                {
                    BytesRef term = termsEnum.Next();
                    if (term == null)
                    {
                        break;
                    }
                    int ngramCount = CountGrams(term);
                    if (ngramCount > grams)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                    }
                    if (ngramCount == 1)
                    {
                        totTokens += termsEnum.TotalTermFreq();
                    }

                    builder.Add(Lucene.Net.Util.Fst.Util.ToIntsRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq()));
                }

                fst = builder.Finish();
                if (fst == null)
                {
                    throw new System.ArgumentException("need at least one suggestion");
                }
                //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                /*
                 * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                 * Util.toDot(fst, pw, true, true);
                 * pw.close();
                 */

                success = true;
            }
            finally
            {
                try
                {
                    if (success)
                    {
                        IOUtils.Close(writer, reader);
                    }
                    else
                    {
                        IOUtils.CloseWhileHandlingException(writer, reader);
                    }
                }
                finally
                {
                    foreach (string file in dir.ListAll())
                    {
                        File path = new File(tempIndexPath, file);
                        if (path.Delete() == false)
                        {
                            throw new InvalidOperationException("failed to remove " + path);
                        }
                    }

                    if (tempIndexPath.Delete() == false)
                    {
                        throw new InvalidOperationException("failed to remove " + tempIndexPath);
                    }

                    dir.Dispose();
                }
            }
        }
Example #10
0
        public virtual void CollectTermContext(IndexReader reader, IList <AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
        {
            TermsEnum termsEnum = null;

            foreach (AtomicReaderContext context in leaves)
            {
                Fields fields = context.AtomicReader.Fields;
                if (fields == null)
                {
                    // reader has no fields
                    continue;
                }
                for (int i = 0; i < queryTerms.Length; i++)
                {
                    Term        term        = queryTerms[i];
                    TermContext termContext = contextArray[i];
                    Terms       terms       = fields.Terms(term.Field);
                    if (terms == null)
                    {
                        // field does not exist
                        continue;
                    }
                    termsEnum = terms.Iterator(termsEnum);
                    Debug.Assert(termsEnum != null);

                    if (termsEnum == TermsEnum.EMPTY)
                    {
                        continue;
                    }
                    if (termsEnum.SeekExact(term.Bytes))
                    {
                        if (termContext == null)
                        {
                            contextArray[i] = new TermContext(reader.Context, termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                        }
                        else
                        {
                            termContext.Register(termsEnum.TermState(), context.Ord, termsEnum.DocFreq(), termsEnum.TotalTermFreq());
                        }
                    }
                }
            }
        }
Example #11
0
        /// <summary>
        /// Safe (but, slowish) default method to write every
        ///  vector field in the document.
        /// </summary>
        protected internal void AddAllDocVectors(Fields vectors, MergeState mergeState)
        {
            if (vectors == null)
            {
                StartDocument(0);
                FinishDocument();
                return;
            }

            int numFields = vectors.Size;

            if (numFields == -1)
            {
                // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
                numFields = 0;
                //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
                foreach (string it in vectors)
                {
                    numFields++;
                }
            }
            StartDocument(numFields);

            string lastFieldName = null;

            TermsEnum            termsEnum            = null;
            DocsAndPositionsEnum docsAndPositionsEnum = null;

            int fieldCount = 0;

            foreach (string fieldName in vectors)
            {
                fieldCount++;
                FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

                Debug.Assert(lastFieldName == null || fieldName.CompareTo(lastFieldName) > 0, "lastFieldName=" + lastFieldName + " fieldName=" + fieldName);
                lastFieldName = fieldName;

                Terms terms = vectors.Terms(fieldName);
                if (terms == null)
                {
                    // FieldsEnum shouldn't lie...
                    continue;
                }

                bool hasPositions = terms.HasPositions();
                bool hasOffsets   = terms.HasOffsets();
                bool hasPayloads  = terms.HasPayloads();
                Debug.Assert(!hasPayloads || hasPositions);

                int numTerms = (int)terms.Size();
                if (numTerms == -1)
                {
                    // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
                    numTerms  = 0;
                    termsEnum = terms.Iterator(termsEnum);
                    while (termsEnum.Next() != null)
                    {
                        numTerms++;
                    }
                }

                StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
                termsEnum = terms.Iterator(termsEnum);

                int termCount = 0;
                while (termsEnum.Next() != null)
                {
                    termCount++;

                    int freq = (int)termsEnum.TotalTermFreq();

                    StartTerm(termsEnum.Term(), freq);

                    if (hasPositions || hasOffsets)
                    {
                        docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                        Debug.Assert(docsAndPositionsEnum != null);

                        int docID = docsAndPositionsEnum.NextDoc();
                        Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                        Debug.Assert(docsAndPositionsEnum.Freq() == freq);

                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos         = docsAndPositionsEnum.NextPosition();
                            int startOffset = docsAndPositionsEnum.StartOffset();
                            int endOffset   = docsAndPositionsEnum.EndOffset();

                            BytesRef payload = docsAndPositionsEnum.Payload;

                            Debug.Assert(!hasPositions || pos >= 0);
                            AddPosition(pos, startOffset, endOffset, payload);
                        }
                    }
                    FinishTerm();
                }
                Debug.Assert(termCount == numTerms);
                FinishField();
            }
            Debug.Assert(fieldCount == numFields);
            FinishDocument();
        }
Example #12
0
            public override bool Collect(BytesRef bytes)
            {
                int       e     = Terms.Add(bytes);
                TermState state = TermsEnum.TermState();

                Debug.Assert(state != null);
                if (e < 0)
                {
                    // duplicate term: update docFreq
                    int pos = (-e) - 1;
                    Array.TermState[pos].Register(state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
                    Debug.Assert(Array.Boost[pos] == BoostAtt.Boost, "boost should be equal in all segment TermsEnums");
                }
                else
                {
                    // new entry: we populate the entry initially
                    Array.Boost[e]     = BoostAtt.Boost;
                    Array.TermState[e] = new TermContext(TopReaderContext, state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
                    OuterInstance.CheckMaxClauseCount(Terms.Size());
                }
                return(true);
            }
Example #13
0
            public override bool Collect(BytesRef bytes)
            {
                int pos = PendingTerms.Add(bytes);

                DocVisitCount += TermsEnum.DocFreq();
                if (PendingTerms.Size() >= TermCountLimit || DocVisitCount >= DocCountCutoff)
                {
                    HasCutOff = true;
                    return(false);
                }

                TermState termState = TermsEnum.TermState();

                Debug.Assert(termState != null);
                if (pos < 0)
                {
                    pos = (-pos) - 1;
                    Array.TermState[pos].Register(termState, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
                }
                else
                {
                    Array.TermState[pos] = new TermContext(TopReaderContext, termState, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
                }
                return(true);
            }