Example #1
        public override BitArray Bits(IndexReader reader)
        {
            // reader.GetVersion could be used to cache
            // Debug.WriteLine(reader.GetVersion()); // could be used to cache
            // if (cached reader == reader && _revFirst ==

            if (_revFirst == All || _revLast == All) // optimization
            {
                return(new BitArray(reader.MaxDoc(), true));
            }

            BitArray last_bits = new BitArray(reader.MaxDoc(), false);

            TermEnum t = reader.Terms(new Term(FieldName.RevisionLast, _revFirst.ToString(RevFormat)));
            TermDocs d = reader.TermDocs();

            //if (t.SkipTo((new Term(FieldName.RevisionLast, revision.ToString(RevFormat))))) // extremely slow
            if (t.Term() != null)
            {
                while (t.Term().Field() == FieldName.RevisionLast)
                {
                    d.Seek(t);
                    while (d.Next())
                    {
                        last_bits[d.Doc()] = true;
                    }
                    if (!t.Next())
                    {
                        break;
                    }
                }
            }

            // optimization: skip if we're just using the head revision
            if (_revLast == Head)
            {
                return(last_bits);
            }

            BitArray first_bits = new BitArray(reader.MaxDoc(), true);

            t = reader.Terms(new Term("rev_first", (_revLast + 1).ToString(RevFormat)));
            //if (t.SkipTo((new Term("rev_first", (revision + 1).ToString(RevFormat))))) // extremely slow
            if (t.Term() != null)
            {
                while (t.Term().Field() == "rev_first")
                {
                    d.Seek(t);
                    while (d.Next())
                    {
                        first_bits[d.Doc()] = false;
                    }
                    if (!t.Next())
                    {
                        break;
                    }
                }
            }
            return(last_bits.And(first_bits));
        }
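A hedged sketch of the reader-version caching that the comments at the top of Bits hint at; the _cachedReader, _cachedVersion, and _cachedBits fields are hypothetical additions, not part of the original class:

        private IndexReader _cachedReader;
        private long        _cachedVersion = -1;
        private BitArray    _cachedBits;

        public BitArray CachedBits(IndexReader reader)
        {
            // IndexReader.GetVersion() changes whenever the index is modified, so the
            // same reader at an unchanged version can reuse the previously computed bits.
            if (_cachedBits != null && _cachedReader == reader && _cachedVersion == reader.GetVersion())
            {
                return(_cachedBits);
            }
            _cachedReader  = reader;
            _cachedVersion = reader.GetVersion();
            _cachedBits    = Bits(reader);
            return(_cachedBits);
        }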
Example #2
        public virtual void  TestTerms()
        {
            TermEnum terms = _reader.Terms();

            Assert.IsTrue(terms != null);
            while (terms.Next() == true)
            {
                Term term = terms.Term;
                Assert.IsTrue(term != null);
                //System.out.println("Term: " + term);
                System.String fieldValue = (System.String)DocHelper.NameValues[term.Field];
                Assert.IsTrue(fieldValue.IndexOf(term.Text) != -1);
            }

            TermDocs termDocs = _reader.TermDocs();

            Assert.IsTrue(termDocs != null);
            termDocs.Seek(new Term(DocHelper.TextField1Key, "field"));
            Assert.IsTrue(termDocs.Next() == true);

            termDocs.Seek(new Term(DocHelper.NoNormsKey, DocHelper.NoNormsText));
            Assert.IsTrue(termDocs.Next() == true);


            TermPositions positions = _reader.TermPositions();

            Assert.IsTrue(positions != null);
            positions.Seek(new Term(DocHelper.TextField1Key, "field"));
            Assert.IsTrue(positions.Doc == 0);
            Assert.IsTrue(positions.NextPosition() >= 0);
        }
Example #3
        public virtual void  TestTerms()
        {
            TermEnum terms = reader.Terms();

            Assert.IsTrue(terms != null);
            while (terms.Next() == true)
            {
                Term term = terms.Term();
                Assert.IsTrue(term != null);
                //System.out.println("Term: " + term);
                System.String fieldValue = (System.String)DocHelper.nameValues[term.Field()];
                Assert.IsTrue(fieldValue.IndexOf(term.Text()) != -1);
            }

            TermDocs termDocs = reader.TermDocs();

            Assert.IsTrue(termDocs != null);
            termDocs.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
            Assert.IsTrue(termDocs.Next() == true);

            termDocs.Seek(new Term(DocHelper.NO_NORMS_KEY, DocHelper.NO_NORMS_TEXT));
            Assert.IsTrue(termDocs.Next() == true);


            TermPositions positions = reader.TermPositions();

            Assert.IsTrue(positions != null);
            positions.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
            Assert.IsTrue(positions.Doc() == 0);
            Assert.IsTrue(positions.NextPosition() >= 0);
        }
Example #4
            protected internal override System.Object CreateValue(IndexReader reader, Entry entryKey)
            {
                System.String   field    = StringHelper.Intern(entryKey.field);
                System.String[] retArray = new System.String[reader.MaxDoc];
                TermDocs        termDocs = reader.TermDocs();
                TermEnum        termEnum = reader.Terms(new Term(field));

                try
                {
                    do
                    {
                        Term term = termEnum.Term;
                        if (term == null || (System.Object)term.Field != (System.Object)field)
                        {
                            break;
                        }
                        System.String termval = term.Text;
                        termDocs.Seek(termEnum);
                        while (termDocs.Next())
                        {
                            retArray[termDocs.Doc] = termval;
                        }
                    }while (termEnum.Next());
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }
                return(retArray);
            }
Example #5
        private static Hit CreateHit(Document primary_doc,
                                     IndexReader secondary_reader,
                                     TermDocs term_docs,
                                     FieldSelector fields)
        {
            Hit hit = DocumentToHit(primary_doc);

            if (secondary_reader == null)
            {
                return(hit);
            }

            // Get the stringified version of the URI
            // exactly as it comes out of the index.
            Term term = new Term("Uri", primary_doc.Get("Uri"));

            term_docs.Seek(term);

            // Move to the first (and only) matching term doc;
            // bail out with the primary-only hit if the secondary index has no entry.
            if (!term_docs.Next())
            {
                return(hit);
            }
            Document secondary_doc =
                (fields == null) ?
                secondary_reader.Document(term_docs.Doc()) :
                secondary_reader.Document(term_docs.Doc(), fields);

            // If we are using the secondary index, now we need to
            // merge the properties from the secondary index
            AddPropertiesToHit(hit, secondary_doc, false);

            return(hit);
        }
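Here the "Uri" term acts as the join key between the two indexes: the hit built from the primary document is enriched with whatever properties the secondary index stores under the same URI.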
Example #6
        public virtual void  TestTerms()
        {
            try
            {
                TermEnum terms = reader.Terms();
                Assert.IsTrue(terms != null);
                while (terms.Next() == true)
                {
                    Term term = terms.Term();
                    Assert.IsTrue(term != null);
                    //System.out.println("Term: " + term);
                    System.String fieldValue = (System.String)DocHelper.nameValues[term.Field()];
                    Assert.IsTrue(fieldValue.IndexOf(term.Text()) != -1);
                }

                TermDocs termDocs = reader.TermDocs();
                Assert.IsTrue(termDocs != null);
                termDocs.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "Field"));
                Assert.IsTrue(termDocs.Next() == true);

                TermPositions positions = reader.TermPositions();
                Assert.IsTrue(positions != null);
                positions.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "Field"));
                Assert.IsTrue(positions.Doc() == 0);
                Assert.IsTrue(positions.NextPosition() >= 0);
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Example #7
        public virtual int doTest(int iter, int ndocs, int maxTF, float percentDocs)
        {
            Directory dir = new RAMDirectory();

            long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);

            AddDocs(dir, ndocs, "foo", "val", maxTF, percentDocs);
            long end = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);

            System.Console.Out.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));

            IndexReader reader = IndexReader.Open(dir);
            TermEnum    tenum  = reader.Terms(new Term("foo", "val"));
            TermDocs    tdocs  = reader.TermDocs();

            start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);

            int ret = 0;

            for (int i = 0; i < iter; i++)
            {
                tdocs.Seek(tenum);
                while (tdocs.Next())
                {
                    ret += tdocs.Doc();
                }
            }

            end = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));

            // release the enumerators and the reader before returning
            tdocs.Close();
            tenum.Close();
            reader.Close();

            return(ret);
        }
Example #8
        /// <summary>Returns an enumeration of all the documents which contain
        /// <code>term</code>. For each document, the document number and the frequency of
        /// the term in that document are provided, for use in search scoring.
        /// Thus, this method implements the mapping:
        /// <p><ul>
        /// Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup>
        /// </ul>
        /// <p>The enumeration is ordered by document number.  Each document number
        /// is greater than all that precede it in the enumeration.
        /// </summary>
        public virtual TermDocs TermDocs(Term term)
        {
            TermDocs termDocs = TermDocs();

            termDocs.Seek(term);
            return(termDocs);
        }
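A hedged usage sketch of the docNum/freq mapping described above; reader, "field", and "value" are placeholders, and the Doc()/Freq() method style follows the 2.x examples in this listing (later ports expose them as properties):

        TermDocs docs = reader.TermDocs(new Term("field", "value"));
        try
        {
            while (docs.Next())
            {
                int docNum = docs.Doc();  // strictly increasing, per the contract above
                int freq   = docs.Freq(); // occurrences of the term in this document
            }
        }
        finally
        {
            docs.Close();
        }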
Example #9
        public override DocIdSet GetDocIdSet(IndexReader reader)
        {
            TermEnum enumerator = query.GetEnum(reader);

            try
            {
                // if current term in enum is null, the enum is empty -> shortcut
                if (enumerator.Term == null)
                {
                    return(DocIdSet.EMPTY_DOCIDSET);
                }
                // else fill into an OpenBitSet
                OpenBitSet bitSet   = new OpenBitSet(reader.MaxDoc);
                int[]      docs     = new int[32];
                int[]      freqs    = new int[32];
                TermDocs   termDocs = reader.TermDocs();
                try
                {
                    int termCount = 0;
                    do
                    {
                        Term term = enumerator.Term;
                        if (term == null)
                        {
                            break;
                        }
                        termCount++;
                        termDocs.Seek(term);
                        while (true)
                        {
                            int count = termDocs.Read(docs, freqs);
                            if (count != 0)
                            {
                                for (int i = 0; i < count; i++)
                                {
                                    bitSet.Set(docs[i]);
                                }
                            }
                            else
                            {
                                break;
                            }
                        }
                    } while (enumerator.Next());

                    query.IncTotalNumberOfTerms(termCount); // {{Aroush-2.9}} porting note: is the use of 'termCount' here right?
                }
                finally
                {
                    termDocs.Close();
                }

                return(bitSet);
            }
            finally
            {
                enumerator.Close();
            }
        }
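A hedged sketch of consuming the returned set through the 2.9-era DocIdSetIterator API; filter and reader are placeholders:

        DocIdSet docIdSet = filter.GetDocIdSet(reader);
        DocIdSetIterator iterator = docIdSet.Iterator();

        while (iterator.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            int doc = iterator.DocID(); // number of the next matching document
        }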
Example #10
            protected internal override System.Object CreateValue(IndexReader reader, Entry entryKey)
            {
                Entry entry = entryKey;

                System.String field  = entry.field;
                FloatParser   parser = (FloatParser)entry.custom;

                if (parser == null)
                {
                    try
                    {
                        return(wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_FLOAT_PARSER));
                    }
                    catch (System.FormatException)
                    {
                        return(wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.NUMERIC_UTILS_FLOAT_PARSER));
                    }
                }
                float[]  retArray = null;
                TermDocs termDocs = reader.TermDocs();
                TermEnum termEnum = reader.Terms(new Term(field));

                try
                {
                    do
                    {
                        Term term = termEnum.Term;
                        if (term == null || (System.Object)term.Field != (System.Object)field)
                        {
                            break;
                        }
                        float termval = parser.ParseFloat(term.Text);
                        if (retArray == null)
                        {
                            // late init
                            retArray = new float[reader.MaxDoc];
                        }
                        termDocs.Seek(termEnum);
                        while (termDocs.Next())
                        {
                            retArray[termDocs.Doc] = termval;
                        }
                    }while (termEnum.Next());
                }
                catch (StopFillCacheException)
                {
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }
                if (retArray == null)
                {
                    // no values
                    retArray = new float[reader.MaxDoc];
                }
                return(retArray);
            }
Example #11
        /// <summary>Returns an enumeration of all the documents which contain
        /// <code>term</code>. For each document, the document number and the frequency of
        /// the term in that document are provided, for use in search scoring.
        /// Thus, this method implements the mapping:
        /// <p><ul>
        /// Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup>
        /// </ul>
        /// <p>The enumeration is ordered by document number.  Each document number
        /// is greater than all that precede it in the enumeration.
        /// </summary>
        /// <throws>  IOException if there is a low-level IO error </throws>
        public virtual TermDocs TermDocs(Term term)
        {
            EnsureOpen();
            TermDocs termDocs = TermDocs();

            termDocs.Seek(term);
            return(termDocs);
        }
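This variant differs from the earlier TermDocs(Term) overload only by the EnsureOpen() guard. A hedged sketch of what such a guard typically looks like; the closed flag is hypothetical:

        protected internal void EnsureOpen()
        {
            if (closed) // hypothetical flag set when Close() is called
            {
                throw new AlreadyClosedException("this IndexReader is closed");
            }
        }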
Example #12
        public virtual void TestMultiTermDocs()
        {
            SqlServerDirectory.ProvisionDatabase(Connection, "test1", true);
            SqlServerDirectory.ProvisionDatabase(Connection, "test2", true);
            SqlServerDirectory.ProvisionDatabase(Connection, "test3", true);

            var ramDir1 = new SqlServerDirectory(Connection, new Options()
            {
                SchemaName = "test1"
            });

            AddDoc(ramDir1, "test foo", true);
            var ramDir2 = new SqlServerDirectory(Connection, new Options()
            {
                SchemaName = "test2"
            });

            AddDoc(ramDir2, "test blah", true);
            var ramDir3 = new SqlServerDirectory(Connection, new Options()
            {
                SchemaName = "test3"
            });

            AddDoc(ramDir3, "test wow", true);

            IndexReader[] readers1 = new[] { IndexReader.Open(ramDir1, false), IndexReader.Open(ramDir3, false) };
            IndexReader[] readers2 = new[] { IndexReader.Open(ramDir1, false), IndexReader.Open(ramDir2, false), IndexReader.Open(ramDir3, false) };
            MultiReader   mr2      = new MultiReader(readers1);
            MultiReader   mr3      = new MultiReader(readers2);

            // test mixing up TermDocs and TermEnums from different readers.
            TermDocs td2 = mr2.TermDocs();
            TermEnum te3 = mr3.Terms(new Term("body", "wow"));

            td2.Seek(te3);
            int ret = 0;

            // This should blow up if we forget to check that the TermEnum is from the same
            // reader as the TermDocs.
            while (td2.Next())
            {
                ret += td2.Doc;
            }
            td2.Close();
            te3.Close();

            // really a dummy assert to ensure that we got some docs and to ensure that
            // nothing is optimized out.
            Assert.IsTrue(ret > 0);
        }
Example #13
        private TermDocs TermDocs(int i)
        {
            if (term == null)
            {
                return(null);
            }
            TermDocs result = readerTermDocs[i];

            if (result == null)
            {
                result = readerTermDocs[i] = TermDocs(readers[i]);
            }
            result.Seek(term);
            return(result);
        }
Example #14
        // TODO: use this function to filter the search results
        public int GetMatchWordCount(IEnumerable <SampleDataFileRow> listFoundDocs, string searchTerm)
        {
            int         totalFreq = 0;
            IndexReader reader    = IndexReader.Open(DirectoryFs, true);

            TermDocs termDocs = reader.TermDocs();

            termDocs.Seek(new Term("LineText", searchTerm));
            foreach (SampleDataFileRow singleRow in listFoundDocs)
            {
                // SkipTo advances to the first doc >= the target and returns false past
                // the end of the postings, so only count the frequency on an exact match.
                if (termDocs.SkipTo(singleRow.LineNumber) && termDocs.Doc == singleRow.LineNumber)
                {
                    totalFreq += termDocs.Freq;
                }
            }

            termDocs.Close();
            reader.Close();

            return(totalFreq);
        }
Example #15
            protected internal override System.Object CreateValue(IndexReader reader, Entry entryKey)
            {
                Entry entry = entryKey;

                System.String field  = entry.field;
                ShortParser   parser = (ShortParser)entry.custom;

                if (parser == null)
                {
                    return(wrapper.GetShorts(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_SHORT_PARSER));
                }
                short[]  retArray = new short[reader.MaxDoc];
                TermDocs termDocs = reader.TermDocs();
                TermEnum termEnum = reader.Terms(new Term(field));

                try
                {
                    do
                    {
                        Term term = termEnum.Term;
                        if (term == null || (System.Object)term.Field != (System.Object)field)
                        {
                            break;
                        }
                        short termval = parser.ParseShort(term.Text);
                        termDocs.Seek(termEnum);
                        while (termDocs.Next())
                        {
                            retArray[termDocs.Doc] = termval;
                        }
                    }while (termEnum.Next());
                }
                catch (StopFillCacheException)
                {
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }
                return(retArray);
            }
Example #16
        /// <summary>
        /// Get the DocIdSet.
        /// </summary>
        /// <param name="reader">Applcible reader.</param>
        /// <returns>The set.</returns>
        public override DocIdSet GetDocIdSet(IndexReader reader)
        {
            OpenBitSet result = new OpenBitSet(reader.MaxDoc);
            TermDocs   td     = reader.TermDocs();

            try
            {
                foreach (Term t in this.terms)
                {
                    td.Seek(t);
                    while (td.Next())
                    {
                        result.Set(td.Doc);
                    }
                }
            }
            finally
            {
                td.Close();
            }

            return(result);
        }
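Unlike the enumerator-driven GetDocIdSet above, this filter walks an explicit list of terms, reusing a single TermDocs via Seek for each one and closing it once in the finally block.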
Example #17
        public virtual void  TestMultiTermDocs()
        {
            RAMDirectory ramDir1 = new RAMDirectory();

            AddDoc(ramDir1, "test foo", true);
            RAMDirectory ramDir2 = new RAMDirectory();

            AddDoc(ramDir2, "test blah", true);
            RAMDirectory ramDir3 = new RAMDirectory();

            AddDoc(ramDir3, "test wow", true);

            IndexReader[] readers1 = new IndexReader[] { IndexReader.Open(ramDir1), IndexReader.Open(ramDir3) };
            IndexReader[] readers2 = new IndexReader[] { IndexReader.Open(ramDir1), IndexReader.Open(ramDir2), IndexReader.Open(ramDir3) };
            MultiReader   mr2      = new MultiReader(readers1);
            MultiReader   mr3      = new MultiReader(readers2);

            // test mixing up TermDocs and TermEnums from different readers.
            TermDocs td2 = mr2.TermDocs();
            TermEnum te3 = mr3.Terms(new Term("body", "wow"));

            td2.Seek(te3);
            int ret = 0;

            // This should blow up if we forget to check that the TermEnum is from the same
            // reader as the TermDocs.
            while (td2.Next())
            {
                ret += td2.Doc();
            }
            td2.Close();
            te3.Close();

            // really a dummy assert to ensure that we got some docs and to ensure that
            // nothing is optimized out.
            Assert.IsTrue(ret > 0);
        }
Example #18
        public void FlushUris()
        {
            if (pending_uris == null)
            {
                return;
            }

            TermDocs term_docs = this.searcher.Reader.TermDocs();

            for (int i = 0; i < pending_uris.Count; i++)
            {
                Term term = new Term("Uri", (string)pending_uris [i]);
                term_docs.Seek(term);

                if (term_docs.Next())
                {
                    this.Set(term_docs.Doc(), true);
                }
            }

            term_docs.Close();

            pending_uris = null;
        }
Example #19
        public override DocIdSet GetDocIdSet(IndexReader reader)
        {
            var bits = new OpenBitSet(reader.MaxDoc());

            TermDocs      termDocs = reader.TermDocs();
            List <double> area     = _shape.Area;
            int           sz       = area.Count;

            // iterate through each boxid
            for (int i = 0; i < sz; i++)
            {
                double boxId = area[i];
                termDocs.Seek(new Term(_fieldName, NumericUtils.DoubleToPrefixCoded(boxId)));

                // iterate through all documents
                // which have this boxId
                while (termDocs.Next())
                {
                    bits.FastSet(termDocs.Doc());
                }
            }

            return(bits);
        }
Example #20
        ////////////////////////////////////////////////////////////////

        static private void ScoreHits(Dictionary <int, Hit> hits_by_id,
                                      IndexReader reader,
                                      ICollection term_list)
        {
            LNS.Similarity similarity;
            similarity = LNS.Similarity.GetDefault();

            TermDocs term_docs = reader.TermDocs();
            Hit      hit;

            foreach (Term term in term_list)
            {
                double idf;
                idf = similarity.Idf(reader.DocFreq(term), reader.MaxDoc());

                int hit_count;
                hit_count = hits_by_id.Count;

                term_docs.Seek(term);
                while (term_docs.Next() && hit_count > 0)
                {
                    int id;
                    id = term_docs.Doc();

                    if (hits_by_id.TryGetValue(id, out hit))
                    {
                        double tf;
                        tf         = similarity.Tf(term_docs.Freq());
                        hit.Score += tf * idf;
                        --hit_count;
                    }
                }
            }

            term_docs.Close();
        }
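ScoreHits is a hand-rolled tf-idf pass: each term contributes Idf(docFreq, maxDoc), each matching document contributes Tf(freq), and the product is accumulated into Hit.Score. The hit_count countdown lets the postings walk stop early once every requested hit has been seen for the current term.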
Example #21
 public virtual void  Seek(Term term)
 {
     in_Renamed.Seek(term);
 }
Example #22
        public virtual void  Test1()
        {
            ParallelReader pr = new ParallelReader();

            pr.Add(ir1);
            pr.Add(ir2);

            TermDocs td = pr.TermDocs();

            TermEnum te = pr.Terms();

            Assert.IsTrue(te.Next());
            Assert.AreEqual("field1:brown", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field1:fox", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field1:jumps", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field1:quick", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field1:the", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field2:brown", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field2:fox", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field2:jumps", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field2:quick", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field2:the", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field3:dog", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field3:fox", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field3:jumps", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field3:lazy", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field3:over", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsTrue(te.Next());
            Assert.AreEqual("field3:the", te.Term().ToString());
            td.Seek(te.Term());
            Assert.IsTrue(td.Next());
            Assert.AreEqual(0, td.Doc());
            Assert.IsFalse(td.Next());
            Assert.IsFalse(te.Next());
        }
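The merged enumeration follows Term's natural order (field first, then text), which is why every field1 term is exhausted before field2 begins; with a single aligned document per underlying reader, each seek lands on doc 0 exactly once.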
Example #23
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String[] termArray = new System.String[] { "eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored", "with", "an" };
            System.String   test1     = "eating chocolate in a computer lab";                                             //6 terms
            System.String   test2     = "computer in a computer lab";                                                     //5 terms
            System.String   test3     = "a chocolate lab grows old";                                                      //5 terms
            System.String   test4     = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["computer"]  = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();

            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();

            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();

            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();

            SetupDoc(testDoc4, test4);

            Directory dir = new RAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();
                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader.Terms();
                TermDocs      termDocs      = knownSearcher.reader.TermDocs();
                //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next() == true)
                {
                    Term term = termEnum.Term();
                    //System.out.println("Term: " + term);
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();
                        //System.out.println("Doc Id: " + docId + " freq " + freq);
                        TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field");
                        float          tf     = sim.Tf(freq);
                        float          idf    = sim.Idf(term, knownSearcher);
                        //float qNorm = sim.queryNorm()
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length);
                        //float coord = sim.coord()
                        //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                        Assert.IsTrue(vector != null);
                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]) == true)
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                    //System.out.println("--------");
                }
                Query query = new TermQuery(new Term("Field", "chocolate"));
                Hits  hits  = knownSearcher.Search(query);
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length() == 3);
                float score = hits.Score(0);

                /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
                 * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
                 * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " +  hits.doc(2).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
                Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString()));
                Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString()));
                Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString()));
                TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field");
                Assert.IsTrue(vector2 != null);
                //System.out.println("Vector: " + vector);
                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    //System.out.println("Term: " + term);
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);
                    System.Object tmpFreqInt = test4Map[term];
                    Assert.IsTrue(tmpFreqInt != null);
                    System.Int32 freqInt = (System.Int32)tmpFreqInt;
                    Assert.IsTrue(freqInt == freq);
                }
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Example #24
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String test1 = "eating chocolate in a computer lab";                                             //6 terms
            System.String test2 = "computer in a computer lab";                                                     //5 terms
            System.String test3 = "a chocolate lab grows old";                                                      //5 terms
            System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["computer"]  = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();

            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();

            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();

            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();

            SetupDoc(testDoc4, test4);

            Directory dir = new MockRAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();
                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader_ForNUnit.Terms();
                TermDocs      termDocs      = knownSearcher.reader_ForNUnit.TermDocs();
                //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next() == true)
                {
                    Term term = termEnum.Term();
                    //System.out.println("Term: " + term);
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();
                        //System.out.println("Doc Id: " + docId + " freq " + freq);
                        TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                        float          tf     = sim.Tf(freq);
                        float          idf    = sim.Idf(term, knownSearcher);
                        //float qNorm = sim.queryNorm()
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                        //float coord = sim.coord()
                        //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                        Assert.IsTrue(vector != null);
                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]))
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                    //System.out.println("--------");
                }
                Query      query = new TermQuery(new Term("field", "chocolate"));
                ScoreDoc[] hits  = knownSearcher.Search(query, null, 1000).scoreDocs;
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length == 3);
                float score = hits[0].score;

                /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
                 * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
                 * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " +  hits.doc(2).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
                Assert.IsTrue(hits[0].doc == 2);
                Assert.IsTrue(hits[1].doc == 3);
                Assert.IsTrue(hits[2].doc == 0);
                TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field");
                Assert.IsTrue(vector2 != null);
                //System.out.println("Vector: " + vector);
                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    //System.out.println("Term: " + term);
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);
                    System.Int32 freqInt = -1;
                    try
                    {
                        freqInt = (System.Int32)test4Map[term];
                    }
                    catch (Exception)
                    {
                        Assert.IsTrue(false);
                    }
                    Assert.IsTrue(freqInt == freq);
                }
                SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper);
                System.Collections.Generic.SortedDictionary <object, object> vectorEntrySet = mapper.GetTermVectorEntrySet();
                Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
                TermVectorEntry last = null;
                foreach (TermVectorEntry tve in vectorEntrySet.Keys)
                {
                    if (tve != null && last != null)
                    {
                        Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted");
                        System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()];
                        //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                        Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:");
                    }
                    last = tve;
                }

                FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper);
                System.Collections.IDictionary map = fieldMapper.GetFieldToTerms();
                Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
                vectorEntrySet = (System.Collections.Generic.SortedDictionary <Object, Object>)map["field"];
                Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
                Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Example #26
        public static void  VerifyEquals(IndexReader r1, IndexReader r2, System.String idField)
        {
            Assert.AreEqual(r1.NumDocs(), r2.NumDocs());
            bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc());

            int[] r2r1 = new int[r2.MaxDoc()];             // r2 id to r1 id mapping

            TermDocs termDocs1 = r1.TermDocs();
            TermDocs termDocs2 = r2.TermDocs();

            // create mapping from id2 space to id1 space based on idField
            idField = StringHelper.Intern(idField);
            TermEnum termEnum = r1.Terms(new Term(idField, ""));

            do
            {
                Term term = termEnum.Term();
                if (term == null || (System.Object)term.Field() != (System.Object)idField)
                {
                    break;
                }

                termDocs1.Seek(termEnum);
                if (!termDocs1.Next())
                {
                    // This doc is deleted and wasn't replaced
                    termDocs2.Seek(termEnum);
                    Assert.IsFalse(termDocs2.Next());
                    continue;
                }

                int id1 = termDocs1.Doc();
                Assert.IsFalse(termDocs1.Next());

                termDocs2.Seek(termEnum);
                Assert.IsTrue(termDocs2.Next());
                int id2 = termDocs2.Doc();
                Assert.IsFalse(termDocs2.Next());

                r2r1[id2] = id1;

                // verify stored fields are equivalent
                try
                {
                    VerifyEquals(r1.Document(id1), r2.Document(id2));
                }
                catch (System.Exception t)
                {
                    System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
                    System.Console.Out.WriteLine("  d1=" + r1.Document(id1));
                    System.Console.Out.WriteLine("  d2=" + r2.Document(id2));
                    throw t;
                }

                try
                {
                    // verify term vectors are equivalent
                    VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2));
                }
                catch (System.Exception e)
                {
                    System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
                    TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1);
                    System.Console.Out.WriteLine("  d1=" + tv1);
                    if (tv1 != null)
                    {
                        for (int i = 0; i < tv1.Length; i++)
                        {
                            System.Console.Out.WriteLine("    " + i + ": " + tv1[i]);
                        }
                    }

                    TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2);
                    System.Console.Out.WriteLine("  d2=" + tv2);
                    if (tv2 != null)
                    {
                        for (int i = 0; i < tv2.Length; i++)
                        {
                            System.Console.Out.WriteLine("    " + i + ": " + tv2[i]);
                        }
                    }

                    throw e;
                }
            }while (termEnum.Next());

            termEnum.Close();

            // Verify postings
            TermEnum termEnum1 = r1.Terms(new Term("", ""));
            TermEnum termEnum2 = r2.Terms(new Term("", ""));

            // pack both doc and freq into single element for easy sorting
            long[] info1 = new long[r1.NumDocs()];
            long[] info2 = new long[r2.NumDocs()];

            for (; ;)
            {
                Term term1, term2;

                // iterate until we get some docs
                int len1;
                for (; ;)
                {
                    len1  = 0;
                    term1 = termEnum1.Term();
                    if (term1 == null)
                    {
                        break;
                    }
                    termDocs1.Seek(termEnum1);
                    while (termDocs1.Next())
                    {
                        int d1 = termDocs1.Doc();
                        int f1 = termDocs1.Freq();
                        info1[len1] = (((long)d1) << 32) | f1;
                        len1++;
                    }
                    if (len1 > 0)
                    {
                        break;
                    }
                    if (!termEnum1.Next())
                    {
                        break;
                    }
                }

                // iterate until we get some docs
                int len2;
                for (; ;)
                {
                    len2  = 0;
                    term2 = termEnum2.Term();
                    if (term2 == null)
                    {
                        break;
                    }
                    termDocs2.Seek(termEnum2);
                    while (termDocs2.Next())
                    {
                        int d2 = termDocs2.Doc();
                        int f2 = termDocs2.Freq();
                        info2[len2] = (((long)r2r1[d2]) << 32) | f2;
                        len2++;
                    }
                    if (len2 > 0)
                    {
                        break;
                    }
                    if (!termEnum2.Next())
                    {
                        break;
                    }
                }

                if (!hasDeletes)
                {
                    Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq());
                }

                Assert.AreEqual(len1, len2);
                if (len1 == 0)
                {
                    break;                     // no more terms
                }
                Assert.AreEqual(term1, term2);

                // sort info2 to get it into ascending docid
            System.Array.Sort(info2, 0, len2);

                // now compare
                for (int i = 0; i < len1; i++)
                {
                    Assert.AreEqual(info1[i], info2[i]);
                }

                termEnum1.Next();
                termEnum2.Next();
            }
        }
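Packing the doc into the high 32 bits and the freq into the low 32 (((long)d1) << 32 | f1) makes a plain long comparison order the postings by document first and frequency second, which is what allows info2 to be compared element-wise against info1 after Array.Sort.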
Example #27
        public virtual void  testSkipTo(int indexDivisor)
        {
            Directory   dir    = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);

            Term ta = new Term("content", "aaa");

            for (int i = 0; i < 10; i++)
            {
                AddDoc(writer, "aaa aaa aaa aaa");
            }

            Term tb = new Term("content", "bbb");

            for (int i = 0; i < 16; i++)
            {
                AddDoc(writer, "bbb bbb bbb bbb");
            }

            Term tc = new Term("content", "ccc");

            for (int i = 0; i < 50; i++)
            {
                AddDoc(writer, "ccc ccc ccc ccc");
            }

            // assure that we deal with a single segment
            writer.Optimize();
            writer.Close();

            IndexReader reader = IndexReader.Open(dir, null, true, indexDivisor);

            TermDocs tdocs = reader.TermDocs();

            // without optimization (assumption skipInterval == 16)

            // with next
            tdocs.Seek(ta);
            Assert.IsTrue(tdocs.Next());
            Assert.AreEqual(0, tdocs.Doc());
            Assert.AreEqual(4, tdocs.Freq());
            Assert.IsTrue(tdocs.Next());
            Assert.AreEqual(1, tdocs.Doc());
            Assert.AreEqual(4, tdocs.Freq());
            Assert.IsTrue(tdocs.SkipTo(0));
            Assert.AreEqual(2, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(4));
            Assert.AreEqual(4, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(9));
            Assert.AreEqual(9, tdocs.Doc());
            Assert.IsFalse(tdocs.SkipTo(10));

            // without next
            tdocs.Seek(ta);
            Assert.IsTrue(tdocs.SkipTo(0));
            Assert.AreEqual(0, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(4));
            Assert.AreEqual(4, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(9));
            Assert.AreEqual(9, tdocs.Doc());
            Assert.IsFalse(tdocs.SkipTo(10));

            // exactly skipInterval documents and therefore with optimization

            // with next
            tdocs.Seek(tb);
            Assert.IsTrue(tdocs.Next());
            Assert.AreEqual(10, tdocs.Doc());
            Assert.AreEqual(4, tdocs.Freq());
            Assert.IsTrue(tdocs.Next());
            Assert.AreEqual(11, tdocs.Doc());
            Assert.AreEqual(4, tdocs.Freq());
            Assert.IsTrue(tdocs.SkipTo(5));
            Assert.AreEqual(12, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(15));
            Assert.AreEqual(15, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(24));
            Assert.AreEqual(24, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(25));
            Assert.AreEqual(25, tdocs.Doc());
            Assert.IsFalse(tdocs.SkipTo(26));

            // without next
            tdocs.Seek(tb);
            Assert.IsTrue(tdocs.SkipTo(5));
            Assert.AreEqual(10, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(15));
            Assert.AreEqual(15, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(24));
            Assert.AreEqual(24, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(25));
            Assert.AreEqual(25, tdocs.Doc());
            Assert.IsFalse(tdocs.SkipTo(26));

            // much more than skipInterval documents and therefore with optimization

            // with next
            tdocs.Seek(tc);
            Assert.IsTrue(tdocs.Next());
            Assert.AreEqual(26, tdocs.Doc());
            Assert.AreEqual(4, tdocs.Freq());
            Assert.IsTrue(tdocs.Next());
            Assert.AreEqual(27, tdocs.Doc());
            Assert.AreEqual(4, tdocs.Freq());
            Assert.IsTrue(tdocs.SkipTo(5));
            Assert.AreEqual(28, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(40));
            Assert.AreEqual(40, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(57));
            Assert.AreEqual(57, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(74));
            Assert.AreEqual(74, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(75));
            Assert.AreEqual(75, tdocs.Doc());
            Assert.IsFalse(tdocs.SkipTo(76));

            //without next
            tdocs.Seek(tc);
            Assert.IsTrue(tdocs.SkipTo(5));
            Assert.AreEqual(26, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(40));
            Assert.AreEqual(40, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(57));
            Assert.AreEqual(57, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(74));
            Assert.AreEqual(74, tdocs.Doc());
            Assert.IsTrue(tdocs.SkipTo(75));
            Assert.AreEqual(75, tdocs.Doc());
            Assert.IsFalse(tdocs.SkipTo(76));

            tdocs.Close();
            reader.Close();
            dir.Close();
        }
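All of the SkipTo assertions above follow the same contract: the call advances to the first document >= the target within the current term's postings (a target below the first posting still lands on that first posting) and returns false only when the postings are exhausted. The three term blocks exercise the paths below, at, and well above the assumed skipInterval of 16.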
Example #28
        private IndexerReceipt [] Flush_Unlocked(IndexerRequest request)
        {
            ArrayList receipt_queue;

            receipt_queue = new ArrayList();

            IndexReader primary_reader, secondary_reader;

            primary_reader   = IndexReader.Open(PrimaryStore);
            secondary_reader = IndexReader.Open(SecondaryStore);

            // Step #1: Make our first pass over the list of
            // indexables that make up our request.  For each add
            // or property change in the request, get the Lucene
            // documents so we can move forward any persistent
            // properties (for adds) or all old properties (for
            // property changes).
            //
            // Then, for each add or remove in the request,
            // delete the associated documents from the index.
            // Note that we previously cached added documents so
            // that we can move persistent properties forward.

            // parent_child_old_props is a two-level hashtable (a depth-2 tree):
            // indexed by parent uri, each entry holds another hashtable mapping
            // the parent and child uris to their old documents.
            // FIXME: a 2-level hashtable is a waste for any non-child document.
            // Replace this with a better data structure.
            Hashtable parent_child_old_props = UriFu.NewHashtable();
            TermDocs  term_docs    = secondary_reader.TermDocs();
            int       delete_count = 0;

            IEnumerable request_indexables = request.Indexables;

            foreach (Indexable indexable in request_indexables)
            {
                string uri_str = UriFu.UriToEscapedString(indexable.Uri);
                Term   term;

                // Store the necessary properties from old documents for re-addition
                if (indexable.Type == IndexableType.Add ||
                    indexable.Type == IndexableType.PropertyChange)
                {
                    term = new Term("Uri", uri_str);
                    term_docs.Seek(term);

                    Hashtable this_parent_child_props = null;

                    if (term_docs.Next())
                    {
                        this_parent_child_props = UriFu.NewHashtable();
                        this_parent_child_props [indexable.Uri] = secondary_reader.Document(term_docs.Doc());
                        parent_child_old_props [indexable.Uri]  = this_parent_child_props;
                    }

                    term = new Term("ParentUri", uri_str);
                    term_docs.Seek(term);

                    while (term_docs.Next())
                    {
                        Document doc = secondary_reader.Document(term_docs.Doc());

                        string child_uri_str = doc.Get("Uri");
                        Uri    child_uri     = UriFu.EscapedStringToUri(child_uri_str);
                        // Any valid lucene document *should* have a Uri, so no need to check for null
                        // Store the child documents too, to save persistent-properties
                        // of child documents
                        this_parent_child_props [child_uri] = doc;
                    }
                }

                // Now remove (non-remove indexables will be re-added in next block)
                Logger.Log.Debug("-{0}", indexable.DisplayUri);

                int num_delete = 0;

                term = new Term("Uri", uri_str);
                // For property changes, only secondary index is modified
                secondary_reader.DeleteDocuments(term);

                // Now remove from everywhere else (if asked to remove, or if asked
                // to add, in which case we first remove and then re-add).
                // This means we also need to remove child documents.
                if (indexable.Type != IndexableType.PropertyChange)
                {
                    num_delete = primary_reader.DeleteDocuments(term);

                    // When we delete an indexable, also delete any children.
                    // FIXME: Shouldn't we also delete any children of children, etc.?
                    term        = new Term("ParentUri", uri_str);
                    num_delete += primary_reader.DeleteDocuments(term);
                    secondary_reader.DeleteDocuments(term);
                }

                // If this is a strict removal (and not a deletion that
                // we are doing in anticipation of adding something back),
                // queue up a removed receipt.
                if (indexable.Type == IndexableType.Remove)
                {
                    IndexerRemovedReceipt r;
                    r            = new IndexerRemovedReceipt(indexable.Id);
                    r.NumRemoved = num_delete;
                    receipt_queue.Add(r);
                }

                delete_count += num_delete;
            }

            term_docs.Close();

            if (HaveItemCount)
            {
                AdjustItemCount(-delete_count);
            }
            else
            {
                SetItemCount(primary_reader);
            }

            // We are now done with the readers, so we close them.
            // We also null them out; somehow, not doing so prevents them from
            // ever being GCed.
            primary_reader.Close();
            primary_reader = null;
            secondary_reader.Close();
            secondary_reader = null;

            // FIXME: If we crash at exactly this point, we are in
            // trouble.  Items will have been dropped from the index
            // without the proper replacements being added.  We can
            // hopefully fix this when we move to Lucene 2.1.

            // Step #2: Make another pass across our list of indexables
            // and write out any new documents.

            if (text_cache != null)
            {
                text_cache.BeginTransaction();
            }

            IndexWriter primary_writer, secondary_writer;

            // FIXME: Lock obtain time-out can happen here; if that happens,
            // an exception will be thrown and this method will break in the middle
            // leaving IndexWriters unclosed! Same for any Lucene.Net-index modification
            // methods.
            primary_writer   = new IndexWriter(PrimaryStore, IndexingAnalyzer, false);
            secondary_writer = null;

            foreach (Indexable indexable in request_indexables)
            {
                // If shutdown has been started, break here
                // FIXME: Some more processing will continue, much of it
                // concerning receipts, but the daemon will ignore receipts
                // now anyway. What is the fastest way to stop from here?
                if (Shutdown.ShutdownRequested)
                {
                    Log.Debug("Shutdown initiated. Breaking while flushing indexables.");
                    break;
                }

                // Receipts for removes were generated in the
                // previous block.  Now we just have to remove
                // items from the text cache.
                if (indexable.Type == IndexableType.Remove)
                {
                    if (text_cache != null)
                    {
                        text_cache.Delete(indexable.Uri);
                    }

                    continue;
                }

                IndexerAddedReceipt r;
                Hashtable           prop_change_docs = (Hashtable)parent_child_old_props [indexable.Uri];

                if (indexable.Type == IndexableType.PropertyChange)
                {
                    Logger.Log.Debug("+{0} (props only)", indexable.DisplayUri);

                    r = new IndexerAddedReceipt(indexable.Id);
                    r.PropertyChangesOnly = true;
                    receipt_queue.Add(r);

                    Document doc;
                    if (prop_change_docs == null)
                    {
                        doc = null;
                    }
                    else
                    {
                        doc = (Document)prop_change_docs [indexable.Uri];
                    }

                    Document new_doc;
                    new_doc = RewriteDocument(doc, indexable);

                    // Write out the new document...
                    if (secondary_writer == null)
                    {
                        secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false);
                    }
                    secondary_writer.AddDocument(new_doc);

                    // Get child property change indexables...
                    ArrayList prop_change_indexables;
                    prop_change_indexables = GetChildPropertyChange(prop_change_docs, indexable);
                    // and store them; no need to delete them first, since they were already removed from the index
                    if (prop_change_indexables == null)
                    {
                        continue;
                    }

                    foreach (Indexable prop_change_indexable in prop_change_indexables)
                    {
                        Log.Debug("+{0} (props only, generated indexable)", prop_change_indexable.Uri);
                        doc     = (Document)prop_change_docs [prop_change_indexable.Uri];
                        new_doc = RewriteDocument(doc, prop_change_indexable);
                        secondary_writer.AddDocument(new_doc);
                    }

                    continue;                     // ...and proceed to the next Indexable
                }

                // If we reach this point we know we are dealing with an IndexableType.Add

                if (indexable.Type != IndexableType.Add)
                {
                    throw new Exception("When I said it was an IndexableType.Add, I meant it!");
                }

                r = AddIndexableToIndex(indexable, primary_writer, ref secondary_writer, prop_change_docs);
                if (r != null)
                {
                    receipt_queue.Add(r);
                }
            }

            if (text_cache != null)
            {
                text_cache.CommitTransaction();
            }

            if (Shutdown.ShutdownRequested)
            {
                foreach (DeferredInfo di in deferred_indexables)
                {
                    di.Cleanup();
                }
                deferred_indexables.Clear();

                foreach (Indexable indexable in request_indexables)
                {
                    indexable.Cleanup();
                }

                primary_writer.Close();
                if (secondary_writer != null)
                {
                    secondary_writer.Close();
                }

                return(null);
            }

            // Step #3: Optimize the indexes if requested.
            if (request.OptimizeIndex)
            {
                Stopwatch watch = new Stopwatch();
                Logger.Log.Debug("Optimizing {0}", IndexName);
                watch.Start();
                primary_writer.Optimize();
                if (secondary_writer == null)
                {
                    secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false);
                }
                secondary_writer.Optimize();
                watch.Stop();
                Logger.Log.Debug("{0} optimized in {1}", IndexName, watch);
            }

            // Step #4. Close our writers and return the events to
            // indicate what has happened.

            primary_writer.Close();
            if (secondary_writer != null)
            {
                secondary_writer.Close();
            }

            // Send a single IndexerIndexablesReceipt if there were deferred indexables
            if (deferred_indexables.Count > 0)
            {
                Log.Debug("{0} indexables generated more indexables; asking daemon to schedule their indexing.", deferred_indexables.Count);
                IndexerIndexablesReceipt r = new IndexerIndexablesReceipt();
                receipt_queue.Add(r);
            }

            IndexerReceipt [] receipt_array;
            receipt_array = new IndexerReceipt [receipt_queue.Count];
            for (int i = 0; i < receipt_queue.Count; ++i)
            {
                receipt_array [i] = (IndexerReceipt)receipt_queue [i];
            }

            return(receipt_array);
        }
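Flush_Unlocked needs two passes because Lucene of this vintage has no atomic update: deletions go through an IndexReader, additions through an IndexWriter, and the two cannot hold the write lock at the same time. A condensed sketch of that delete-then-add idiom, assuming the same API as above; the field name and analyzer are illustrative stand-ins (the code above uses its own IndexingAnalyzer and receipt bookkeeping):

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;

static class ReplaceDocument
{
    // Replace = delete pass (reader) followed by an add pass (writer),
    // mirroring Step #1 and Step #2 of Flush_Unlocked above.
    public static void Replace(Directory store, string uriStr, Document newDoc)
    {
        IndexReader reader = IndexReader.Open(store);
        try
        {
            reader.DeleteDocuments(new Term("Uri", uriStr));
        }
        finally
        {
            reader.Close(); // releases the write lock for the writer below
        }

        // 'false' = append to an existing index rather than create a new one.
        IndexWriter writer = new IndexWriter(store, new StandardAnalyzer(), false);
        try
        {
            writer.AddDocument(newDoc);
        }
        finally
        {
            writer.Close();
        }
    }
}

The FIXME about crashing "at exactly this point" falls straight out of this split: the deletions are committed when the reader closes, before the replacement documents are written.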
Example #29
        public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
        {
            long t0     = System.Environment.TickCount;
            int  maxdoc = reader.MaxDoc;

            BigNestedIntArray.BufferedLoader loader       = GetBufferedLoader(maxdoc, workArea);
            BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

            TermEnum   tenum              = null;
            TermDocs   tdoc               = null;
            var        list               = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
            List <int> minIDList          = new List <int>();
            List <int> maxIDList          = new List <int>();
            List <int> freqList           = new List <int>();
            OpenBitSet bitset             = new OpenBitSet(maxdoc + 1);
            int        negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
            int        t = 0; // current term number

            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            t++;

            _overflow = false;

            string pre = null;

            int df    = 0;
            int minID = -1;
            int maxID = -1;
            int valId = 0;

            try
            {
                tdoc  = reader.TermDocs();
                tenum = reader.Terms(new Term(fieldName, ""));
                if (tenum != null)
                {
                    do
                    {
                        Term term = tenum.Term;
                        if (term == null || !fieldName.Equals(term.Field))
                        {
                            break;
                        }

                        string val = term.Text;

                        if (val != null)
                        {
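                            // Terms for this field arrive as "value\0weight";
                            // the split below recovers both parts, so the same
                            // value indexed with different weights collapses
                            // into a single term-list entry.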
                            int      weight = 0;
                            string[] split  = val.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
                            if (split.Length > 1)
                            {
                                val    = split[0];
                                weight = int.Parse(split[split.Length - 1]);
                            }
                            if (pre == null || !val.Equals(pre))
                            {
                                if (pre != null)
                                {
                                    freqList.Add(df);
                                    minIDList.Add(minID);
                                    maxIDList.Add(maxID);
                                }

                                list.Add(val);

                                df    = 0;
                                minID = -1;
                                maxID = -1;
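                                // Negative numeric values sort first in the term
                                // dictionary but in reverse numeric order, so the
                                // first negativeValueCount ids are handed out in
                                // reverse to restore numeric ordering.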
                                valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                                t++;
                            }

                            tdoc.Seek(tenum);
                            if (tdoc.Next())
                            {
                                df++;
                                int docid = tdoc.Doc;

                                if (!loader.Add(docid, valId))
                                {
                                    LogOverflow(fieldName);
                                }
                                else
                                {
                                    weightLoader.Add(docid, weight);
                                }

                                // minID starts at -1 for each new value, so the bare
                                // (docid < minID) test would never record the first docid
                                if (minID == -1 || docid < minID)
                                {
                                    minID = docid;
                                }
                                bitset.FastSet(docid);
                                while (tdoc.Next())
                                {
                                    df++;
                                    docid = tdoc.Doc;

                                    if (!loader.Add(docid, valId))
                                    {
                                        LogOverflow(fieldName);
                                    }
                                    else
                                    {
                                        weightLoader.Add(docid, weight);
                                    }

                                    bitset.FastSet(docid);
                                }
                                if (docid > maxID)
                                {
                                    maxID = docid;
                                }
                            }
                            pre = val;
                        }
                    }while (tenum.Next());
                    if (pre != null)
                    {
                        freqList.Add(df);
                        minIDList.Add(minID);
                        maxIDList.Add(maxID);
                    }
                }
            }
            finally
            {
                try
                {
                    if (tdoc != null)
                    {
                        tdoc.Dispose();
                    }
                }
                finally
                {
                    if (tenum != null)
                    {
                        tenum.Dispose();
                    }
                }
            }

            list.Seal();

            try
            {
                _nestedArray.Load(maxdoc + 1, loader);
                _weightArray.Load(maxdoc + 1, weightLoader);
            }
            catch (System.IO.IOException)
            {
                throw; // rethrow without resetting the stack trace
            }
            catch (Exception e)
            {
                throw new RuntimeException("failed to load due to " + e.ToString(), e);
            }

            this.valArray = list;
            this.freqs    = freqList.ToArray();
            this.minIDs   = minIDList.ToArray();
            this.maxIDs   = maxIDList.ToArray();

            int doc = 0;

            while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
            {
                ++doc;
            }
            if (doc <= maxdoc)
            {
                this.minIDs[0] = doc;
                doc            = maxdoc;
                while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
                {
                    --doc;
                }
                if (doc > 0)
                {
                    this.maxIDs[0] = doc;
                }
            }
            this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
        }
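The weight handling in this Load() implies that, at indexing time, each facet value was stored as the value and its weight joined by a '\0' separator, matching the Split(new char[] { '\0' }, ...) decoding above. A hypothetical encoder for that scheme; the helper names and field options are assumptions, not part of the library:

using Lucene.Net.Documents;

static class WeightedTermEncoder
{
    // Joins a facet value and its per-document weight with '\0',
    // the separator Load() splits on.
    public static string Encode(string value, int weight)
    {
        return value + '\0' + weight.ToString();
    }

    public static Field MakeField(string fieldName, string value, int weight)
    {
        // NOT_ANALYZED so the encoded "value\0weight" survives as a single term.
        return new Field(fieldName, Encode(value, weight),
                         Field.Store.NO, Field.Index.NOT_ANALYZED);
    }
}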
        /// <summary>
        /// Loads multi-value facet data. This method uses a work area to prepare the loading.
        /// </summary>
        /// <param name="fieldName">the name of the field to load</param>
        /// <param name="reader">the index reader to load from</param>
        /// <param name="listFactory">factory for the term value list; null yields a plain string list</param>
        /// <param name="workArea">work area used to prepare the buffered loading</param>
        public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
        {
            long t0     = Environment.TickCount;
            int  maxdoc = reader.MaxDoc;

            BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

            TermEnum       tenum              = null;
            TermDocs       tdoc               = null;
            ITermValueList list               = (listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList());
            List <int>     minIDList          = new List <int>();
            List <int>     maxIDList          = new List <int>();
            List <int>     freqList           = new List <int>();
            OpenBitSet     bitset             = new OpenBitSet();
            int            negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
            int            t = 0; // current term number

            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            t++;

            _overflow = false;
            try
            {
                tdoc  = reader.TermDocs();
                tenum = reader.Terms(new Term(fieldName, ""));
                if (tenum != null)
                {
                    do
                    {
                        Term term = tenum.Term;
                        if (term == null || !fieldName.Equals(term.Field))
                        {
                            break;
                        }

                        string val = term.Text;

                        if (val != null)
                        {
                            list.Add(val);

                            tdoc.Seek(tenum);
                            //freqList.add(tenum.docFreq()); // removed because the df doesn't take into account the num of deletedDocs
                            int df    = 0;
                            int minID = -1;
                            int maxID = -1;
                            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                            if (tdoc.Next())
                            {
                                df++;
                                int docid = tdoc.Doc;

                                if (!loader.Add(docid, valId))
                                {
                                    LogOverflow(fieldName);
                                }
                                minID = docid;
                                bitset.Set(docid);
                                while (tdoc.Next())
                                {
                                    df++;
                                    docid = tdoc.Doc;

                                    if (!loader.Add(docid, valId))
                                    {
                                        LogOverflow(fieldName);
                                    }
                                    bitset.Set(docid);
                                }
                                maxID = docid;
                            }
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }

                        t++;
                    }while (tenum.Next());
                }
            }
            finally
            {
                try
                {
                    if (tdoc != null)
                    {
                        tdoc.Dispose();
                    }
                }
                finally
                {
                    if (tenum != null)
                    {
                        tenum.Dispose();
                    }
                }
            }

            list.Seal();

            try
            {
                _nestedArray.Load(maxdoc + 1, loader);
            }
            catch (System.IO.IOException)
            {
                throw; // rethrow without resetting the stack trace
            }
            catch (Exception e)
            {
                throw new RuntimeException("failed to load due to " + e.ToString(), e);
            }

            this.valArray = list;
            this.freqs    = freqList.ToArray();
            this.minIDs   = minIDList.ToArray();
            this.maxIDs   = maxIDList.ToArray();

            int doc = 0;

            while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
            {
                ++doc;
            }
            if (doc <= maxdoc)
            {
                this.minIDs[0] = doc;
                doc            = maxdoc;
                while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
                {
                    --doc;
                }
                if (doc > 0)
                {
                    this.maxIDs[0] = doc;
                }
            }
            this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
        }
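After Load() seals the term list, valArray, freqs, minIDs and maxIDs form parallel arrays: slot 0 is the reserved "no value" entry and real values start at index 1 in term-sorted order, so a value can be found by binary search. A minimal sketch of a lookup against arrays of that shape; the class is hypothetical, and a plain List<string> stands in for the ITermValueList used above:

using System;
using System.Collections.Generic;

class FacetValueStats
{
    private readonly List<string> values; // values[0] is null, the "no value" slot
    private readonly int[] freqs;
    private readonly int[] minIDs;
    private readonly int[] maxIDs;

    public FacetValueStats(List<string> values, int[] freqs, int[] minIDs, int[] maxIDs)
    {
        this.values = values;
        this.freqs = freqs;
        this.minIDs = minIDs;
        this.maxIDs = maxIDs;
    }

    // Binary search is valid because Load() visits terms in sorted order.
    public void Describe(string value)
    {
        int i = values.BinarySearch(value, StringComparer.Ordinal);
        if (i < 1)
        {
            Console.WriteLine("{0}: not indexed", value);
            return;
        }
        Console.WriteLine("{0}: df={1}, docs [{2}..{3}]",
                          value, freqs[i], minIDs[i], maxIDs[i]);
    }
}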
Example #31
        public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
        {
            string field  = string.Intern(fieldName);
            int    maxDoc = reader.MaxDoc;

            BigSegmentedArray order = this.orderArray;

            if (order == null) // no array yet; otherwise we reuse the memory
            {
                int dictValueCount = GetDictValueCount(reader, fieldName);
                order = NewInstance(dictValueCount, maxDoc);
            }
            else
            {
                order.EnsureCapacity(maxDoc); // no need to fill with 0, we are resetting the
                                              // data anyway
            }
            this.orderArray = order;

            List <int> minIDList = new List <int>();
            List <int> maxIDList = new List <int>();
            List <int> freqList  = new List <int>();

            int            length             = maxDoc + 1;
            ITermValueList list               = listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList();
            int            negativeValueCount = GetNegativeValueCount(reader, field);

            TermDocs termDocs = reader.TermDocs();
            TermEnum termEnum = reader.Terms(new Term(field, ""));
            int      t        = 0; // current term number

            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            int totalFreq = 0;

            //int df = 0;
            t++;
            try
            {
                do
                {
                    Term term = termEnum.Term;
                    if (term == null || string.CompareOrdinal(term.Field, field) != 0)
                    {
                        break;
                    }

                    // store term text
                    // we expect that there is at most one term per document

                    // Alexey: well, we could now get more than one term per document. Effectively, we could build a facet against a tokenized field.
                    //if (t >= length)
                    //{
                    //    throw new RuntimeException("there are more terms than " + "documents in field \"" + field
                    //        + "\", but it's impossible to sort on " + "tokenized fields");
                    //}
                    list.Add(term.Text);
                    termDocs.Seek(termEnum);
                    // freqList.add(termEnum.docFreq()); // doesn't take into account deldocs
                    int minID = -1;
                    int maxID = -1;
                    int df    = 0;
                    int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    if (termDocs.Next())
                    {
                        df++;
                        int docid = termDocs.Doc;
                        order.Add(docid, valId);
                        minID = docid;
                        while (termDocs.Next())
                        {
                            df++;
                            docid = termDocs.Doc;
                            order.Add(docid, valId);
                        }
                        maxID = docid;
                    }
                    freqList.Add(df);
                    totalFreq += df;
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);

                    t++;
                } while (termEnum.Next());
            }
            finally
            {
                termDocs.Dispose();
                termEnum.Dispose();
            }
            list.Seal();
            this.valArray = list;
            this.freqs    = freqList.ToArray();
            this.minIDs   = minIDList.ToArray();
            this.maxIDs   = maxIDList.ToArray();

            int doc = 0;

            while (doc <= maxDoc && order.Get(doc) != 0)
            {
                ++doc;
            }
            if (doc <= maxDoc)
            {
                this.minIDs[0] = doc;
                // Try to get the max
                doc = maxDoc;
                while (doc > 0 && order.Get(doc) != 0)
                {
                    --doc;
                }
                if (doc > 0)
                {
                    this.maxIDs[0] = doc;
                }
            }
            this.freqs[0] = maxDoc + 1 - totalFreq;
        }
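Unlike the multi-value loaders, this variant records at most one term id per document in orderArray (the Alexey comment above notes that tokenized fields can break this assumption), so facet counting over a hit list reduces to one array read per document. A sketch of that counting loop, with a plain int[] standing in for BigSegmentedArray to keep the sketch self-contained:

static class FacetCounter
{
    // counts[v] accumulates hits per term id; id 0 is the "no value"
    // slot, matching the order.Get(doc) != 0 scan at the end of Load().
    public static int[] CountHits(int[] order, int[] hitDocs, int valueCount)
    {
        int[] counts = new int[valueCount];
        foreach (int docid in hitDocs)
        {
            counts[order[docid]]++; // the real cache would call order.Get(docid)
        }
        return counts;
    }
}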