Terms() public abstract method

Returns an enumeration of all the terms in the index. The enumeration is ordered by Term.CompareTo(); each term is greater than all terms that precede it in the enumeration. Note that after calling Terms(), TermEnum.Next() must be called once on the resulting enumeration before calling other methods such as TermEnum.Term().
public abstract TermEnum Terms()
Returns: TermEnum
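
For illustration, here is a minimal sketch of the contract described above: call Next() once before reading Term(). It assumes an already-open Directory named dir (hypothetical, not taken from the examples below) and uses the older method-style API that Example #1 uses; newer Lucene.Net builds expose Term, Text, and Field as properties instead.

        // Minimal sketch; `dir` is an assumed, already-populated Directory.
        IndexReader reader = IndexReader.Open(dir);
        TermEnum termEnum = reader.Terms();
        try
        {
            // Next() must be called once before Term() returns a valid term.
            while (termEnum.Next())
            {
                Term term = termEnum.Term();
                System.Console.Out.WriteLine(
                    term.Field() + ":" + term.Text() + " docFreq=" + termEnum.DocFreq());
            }
        }
        finally
        {
            termEnum.Close();
            reader.Close();
        }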
Example #1
        private void  VerifyDocFreq()
        {
            IndexReader reader   = IndexReader.Open(dir);
            TermEnum    termEnum = null;

            // create enumeration of all terms
            termEnum = reader.Terms();
            // go to the first term (aaa)
            termEnum.Next();
            // assert that term is 'aaa'
            Assert.AreEqual("aaa", termEnum.Term().Text());
            Assert.AreEqual(200, termEnum.DocFreq());
            // go to the second term (bbb)
            termEnum.Next();
            // assert that term is 'bbb'
            Assert.AreEqual("bbb", termEnum.Term().Text());
            Assert.AreEqual(100, termEnum.DocFreq());

            termEnum.Close();


            // create enumeration of terms after term 'aaa', including 'aaa'
            termEnum = reader.Terms(new Term("content", "aaa"));
            // assert that term is 'aaa'
            Assert.AreEqual("aaa", termEnum.Term().Text());
            Assert.AreEqual(200, termEnum.DocFreq());
            // go to term 'bbb'
            termEnum.Next();
            // assert that term is 'bbb'
            Assert.AreEqual("bbb", termEnum.Term().Text());
            Assert.AreEqual(100, termEnum.DocFreq());

            termEnum.Close();
        }
        public MultiTermEnum(IndexReader[] readers, int[] starts, Term t)
        {
            queue = new SegmentMergeQueue(readers.Length);
            for (int i = 0; i < readers.Length; i++)
            {
                IndexReader reader = readers[i];
                TermEnum    termEnum;

                if (t != null)
                {
                    termEnum = reader.Terms(t);
                }
                else
                {
                    termEnum = reader.Terms();
                }

                SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
                if (t == null ? smi.Next() : termEnum.Term() != null)
                {
                    queue.Put(smi);
                }
                // initialize queue
                else
                {
                    smi.Close();
                }
            }

            if (t != null && queue.Size() > 0)
            {
                Next();
            }
        }
        public virtual int doTest(int iter, int ndocs, int maxTF, float percentDocs)
        {
            Directory dir = new RAMDirectory();

            long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);

            AddDocs(dir, ndocs, "foo", "val", maxTF, percentDocs);
            long end = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);

            System.Console.Out.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));

            IndexReader reader = IndexReader.Open(dir);
            TermEnum    tenum  = reader.Terms(new Term("foo", "val"));
            TermDocs    tdocs  = reader.TermDocs();

            start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);

            int ret = 0;

            for (int i = 0; i < iter; i++)
            {
                tdocs.Seek(tenum);
                while (tdocs.Next())
                {
                    ret += tdocs.Doc();
                }
            }

            end = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));

            return(ret);
        }
Example #4
            protected internal override string[] CreateValue(IndexReader reader, Entry entryKey, IState state)
            {
                System.String   field    = StringHelper.Intern(entryKey.field);
                System.String[] retArray = new System.String[reader.MaxDoc];
                TermDocs        termDocs = reader.TermDocs(state);
                TermEnum        termEnum = reader.Terms(new Term(field), state);

                try
                {
                    do
                    {
                        Term term = termEnum.Term;
                        if (term == null || (System.Object)term.Field != (System.Object)field)
                        {
                            break;
                        }
                        System.String termval = term.Text;
                        termDocs.Seek(termEnum, state);
                        while (termDocs.Next(state))
                        {
                            retArray[termDocs.Doc] = termval;
                        }
                    }while (termEnum.Next(state));
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }
                return(retArray);
            }
Example #5
            protected internal override float[] CreateValue(IndexReader reader, Entry entryKey, IState state)
            {
                Entry entry = entryKey;

                System.String field  = entry.field;
                FloatParser   parser = (FloatParser)entry.custom;

                if (parser == null)
                {
                    try
                    {
                        return(wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_FLOAT_PARSER, state));
                    }
                    catch (System.FormatException)
                    {
                        return(wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.NUMERIC_UTILS_FLOAT_PARSER, state));
                    }
                }
                float[]  retArray = null;
                TermDocs termDocs = reader.TermDocs(state);
                TermEnum termEnum = reader.Terms(new Term(field), state);

                try
                {
                    do
                    {
                        Term term = termEnum.Term;
                        if (term == null || (System.Object)term.Field != (System.Object)field)
                        {
                            break;
                        }
                        float termval = parser.ParseFloat(term.Text);
                        if (retArray == null)
                        {
                            // late init
                            retArray = new float[reader.MaxDoc];
                        }
                        termDocs.Seek(termEnum, state);
                        while (termDocs.Next(state))
                        {
                            retArray[termDocs.Doc] = termval;
                        }
                    }while (termEnum.Next(state));
                }
                catch (StopFillCacheException)
                {
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }
                if (retArray == null)
                {
                    // no values
                    retArray = new float[reader.MaxDoc];
                }
                return(retArray);
            }
Example #6
        public virtual void  TestPhrasePrefix()
        {
            RAMDirectory indexStore = new RAMDirectory();
            IndexWriter  writer     = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED, null);
            Document     doc1       = new Document();
            Document     doc2       = new Document();
            Document     doc3       = new Document();
            Document     doc4       = new Document();
            Document     doc5       = new Document();

            doc1.Add(new Field("body", "blueberry pie", Field.Store.YES, Field.Index.ANALYZED));
            doc2.Add(new Field("body", "blueberry strudel", Field.Store.YES, Field.Index.ANALYZED));
            doc3.Add(new Field("body", "blueberry pizza", Field.Store.YES, Field.Index.ANALYZED));
            doc4.Add(new Field("body", "blueberry chewing gum", Field.Store.YES, Field.Index.ANALYZED));
            doc5.Add(new Field("body", "piccadilly circus", Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc1, null);
            writer.AddDocument(doc2, null);
            writer.AddDocument(doc3, null);
            writer.AddDocument(doc4, null);
            writer.AddDocument(doc5, null);
            writer.Optimize(null);
            writer.Close();

            IndexSearcher searcher = new IndexSearcher(indexStore, true, null);

            //PhrasePrefixQuery query1 = new PhrasePrefixQuery();
            MultiPhraseQuery query1 = new MultiPhraseQuery();
            //PhrasePrefixQuery query2 = new PhrasePrefixQuery();
            MultiPhraseQuery query2 = new MultiPhraseQuery();

            query1.Add(new Term("body", "blueberry"));
            query2.Add(new Term("body", "strawberry"));

            System.Collections.ArrayList termsWithPrefix = new System.Collections.ArrayList();
            IndexReader ir = IndexReader.Open((Directory)indexStore, true, null);

            // this TermEnum gives "piccadilly", "pie" and "pizza".
            System.String prefix = "pi";
            TermEnum      te     = ir.Terms(new Term("body", prefix + "*"), null);

            do
            {
                if (te.Term.Text.StartsWith(prefix))
                {
                    termsWithPrefix.Add(te.Term);
                }
            }while (te.Next(null));

            query1.Add((Term[])termsWithPrefix.ToArray(typeof(Term)));
            query2.Add((Term[])termsWithPrefix.ToArray(typeof(Term)));

            ScoreDoc[] result;
            result = searcher.Search(query1, null, 1000, null).ScoreDocs;
            Assert.AreEqual(2, result.Length);

            result = searcher.Search(query2, null, 1000, null).ScoreDocs;
            Assert.AreEqual(0, result.Length);
        }
Example #7
        private void  MergeTermInfos()
        {
            int base_Renamed = 0;

            for (int i = 0; i < readers.Count; i++)
            {
                IndexReader      reader   = (IndexReader)readers[i];
                TermEnum         termEnum = reader.Terms();
                SegmentMergeInfo smi      = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                base_Renamed += reader.NumDocs();
                if (smi.Next())
                {
                    queue.Put(smi);
                }
                // initialize queue
                else
                {
                    smi.Close();
                }
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            while (queue.Size() > 0)
            {
                int matchSize = 0;                 // pop matching terms
                match[matchSize++] = (SegmentMergeInfo)queue.Pop();
                Term             term = match[0].term;
                SegmentMergeInfo top  = (SegmentMergeInfo)queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = (SegmentMergeInfo)queue.Pop();
                    top = (SegmentMergeInfo)queue.Top();
                }

                int df = MergeTermInfo(match, matchSize);                 // add new TermInfo

                if (checkAbort != null)
                {
                    checkAbort.Work(df / 3.0);
                }

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                    {
                        queue.Put(smi);
                    }
                    // restore queue
                    else
                    {
                        smi.Close();                         // done with a segment
                    }
                }
            }
        }
Example #8
 /// <summary>
 /// Initialization method called by subclasses to simulate a shared
 /// base constructor as generic classes cannot have a parameterized ctor.
 /// </summary>
 /// <param name="reader">The index reader to read from.</param>
 /// <param name="fieldName">The field to enumerate.</param>
 /// <param name="includeDocs">Whether this enumerator will support TermDocs.</param>
 protected void Init(IndexReader reader, string fieldName, bool includeDocs)
 {
     this.termEnum = reader.Terms(new Term(fieldName));
     if (includeDocs)
     {
         this.termDocs = reader.TermDocs();
         this.tdEnum   = new TermDocEnumerator.TermDocUsingTermsEnumerator(this.termDocs, this.termEnum);
     }
     this.tEnum = new TermEnumerator(termEnum, termDocs, fieldName, this);
 }
Example #9
            public ParallelTermEnum(ParallelReader enclosingInstance, Term term, IState state)
            {
                InitBlock(enclosingInstance);
                field = term.Field;
                IndexReader reader = Enclosing_Instance.fieldToReader[field];

                if (reader != null)
                {
                    termEnum = reader.Terms(term, state);
                }
            }
            public ParallelTermEnum(ParallelReader enclosingInstance, Term term)
            {
                InitBlock(enclosingInstance);
                field = term.Field();
                IndexReader reader = ((IndexReader)Enclosing_Instance.fieldToReader[field]);

                if (reader != null)
                {
                    termEnum = reader.Terms(term);
                }
            }
        public RegexTermEnum(IndexReader reader, Term term, IRegexCapabilities regexImpl)
        {
            _sField = term.Field;
            string sText = term.Text;

            _regexImpl = regexImpl;

            _regexImpl.Compile(sText);

            _sPre = _regexImpl.Prefix() ?? "";

            SetEnum(reader.Terms(new Term(term.Field, _sPre)));
        }
            /// <summary>Increments the enumeration to the next element.  True if one exists. </summary>
            //@Override
            public override bool Next()
            {
                // if a current term exists, the actual enum is initialized:
                // try change to next term, if no such term exists, fall-through
                if (currentTerm != null)
                {
                    System.Diagnostics.Debug.Assert(actualEnum != null);
                    if (actualEnum.Next())
                    {
                        currentTerm = actualEnum.Term;
                        if (TermCompare(currentTerm))
                        {
                            return(true);
                        }
                    }
                }
                // if all above fails, we go forward to the next enum,
                // if one is available
                currentTerm = null;
                while (rangeBounds.Count >= 2)
                {
                    // close the current enum and read next bounds
                    if (actualEnum != null)
                    {
                        actualEnum.Close();
                        actualEnum = null;
                    }
                    string lowerBound = rangeBounds.First.Value;
                    rangeBounds.RemoveFirst();
                    this.currentUpperBound = rangeBounds.First.Value;
                    rangeBounds.RemoveFirst();
                    // create a new enum
                    actualEnum  = reader.Terms(termTemplate.CreateTerm(lowerBound));
                    currentTerm = actualEnum.Term;
                    if (currentTerm != null && TermCompare(currentTerm))
                    {
                        return(true);
                    }
                    // clear the current term for next iteration
                    currentTerm = null;
                }

                // no more sub-range enums available
                System.Diagnostics.Debug.Assert(rangeBounds.Count == 0 && currentTerm == null);
                return(false);
            }
Example #13
        public virtual void  TestThreadSafety()
        {
            rnd = NewRandom();
            int           numThreads = 5;
            int           numDocs    = 50;
            ByteArrayPool pool       = new ByteArrayPool(numThreads, 5);

            Directory   dir    = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED, null);

            System.String field = "test";

            ThreadClass[] ingesters = new ThreadClass[numThreads];
            for (int i = 0; i < numThreads; i++)
            {
                ingesters[i] = new AnonymousClassThread(numDocs, field, pool, writer, this);
                ingesters[i].Start();
            }

            for (int i = 0; i < numThreads; i++)
            {
                ingesters[i].Join();
            }
            writer.Close();
            IndexReader reader = IndexReader.Open(dir, true, null);
            TermEnum    terms  = reader.Terms(null);

            while (terms.Next(null))
            {
                TermPositions tp = reader.TermPositions(terms.Term, null);
                while (tp.Next(null))
                {
                    int freq = tp.Freq;
                    for (int i = 0; i < freq; i++)
                    {
                        tp.NextPosition(null);
                        Assert.AreEqual(pool.BytesToString(tp.GetPayload(new byte[5], 0, null)), terms.Term.Text);
                    }
                }
                tp.Close();
            }
            terms.Close();
            reader.Close();

            Assert.AreEqual(pool.Size(), numThreads);
        }
        private IEnumerable<string> GetFieldValues(IndexReader reader, string groupByField)
        {
            TermEnum te = reader.Terms(new Term(groupByField, string.Empty));
            if (te.Term() == null || te.Term().Field() != groupByField)
                return Enumerable.Empty<string>();

            var list = new List<string>();
            list.Add(te.Term().Text());

            while (te.Next())
            {
                if (te.Term().Field() != groupByField)
                    break;

                list.Add(te.Term().Text());
            }
            return list;
        }
Example #15
            protected internal override short[] CreateValue(IndexReader reader, Entry entryKey, IState state)
            {
                Entry entry = entryKey;

                System.String field  = entry.field;
                ShortParser   parser = (ShortParser)entry.custom;

                if (parser == null)
                {
                    return(wrapper.GetShorts(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_SHORT_PARSER, state));
                }
                short[]  retArray = new short[reader.MaxDoc];
                TermDocs termDocs = reader.TermDocs(state);
                TermEnum termEnum = reader.Terms(new Term(field), state);

                try
                {
                    do
                    {
                        Term term = termEnum.Term;
                        if (term == null || (System.Object)term.Field != (System.Object)field)
                        {
                            break;
                        }
                        short termval = parser.ParseShort(term.Text);
                        termDocs.Seek(termEnum, state);
                        while (termDocs.Next(state))
                        {
                            retArray[termDocs.Doc] = termval;
                        }
                    }while (termEnum.Next(state));
                }
                catch (StopFillCacheException)
                {
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }
                return(retArray);
            }
Example #16
        private static Dictionary<string, int[]> FillCache(IndexReader reader, int docBase, string field)
        {
            using (var termDocs = reader.TermDocs())
            {
                var items = new Dictionary<string, int[]>();
                var docsForTerm = new List<int>();

                using (var termEnum = reader.Terms(new Term(field)))
                {
                    do
                    {
                        if (termEnum.Term == null || field != termEnum.Term.Field)
                            break;

                        Term term = termEnum.Term;
                        if (LowPrecisionNumber(term.Field, term.Text))
                            continue;

                        var totalDocCountIncludedDeletes = termEnum.DocFreq();
                        termDocs.Seek(termEnum.Term);
                        while (termDocs.Next() && totalDocCountIncludedDeletes > 0)
                        {
                            var curDoc = termDocs.Doc;
                            totalDocCountIncludedDeletes -= 1;
                            if (reader.IsDeleted(curDoc))
                                continue;

                            docsForTerm.Add(curDoc + docBase);
                        }
                        docsForTerm.Sort();
                        items[term.Text] = docsForTerm.ToArray();
                        docsForTerm.Clear();
                    } while (termEnum.Next());
                }
                return items;
            }
        }
 private OpenBitSet CorrectBits(IndexReader reader)
 {
     OpenBitSet bits = new OpenBitSet(reader.MaxDoc); //assume all are INvalid
     Term startTerm = new Term(fieldName);
     TermEnum te = reader.Terms(startTerm);
     if (te != null)
     {
         Term currTerm = te.Term;
         while ((currTerm != null) && (currTerm.Field == startTerm.Field)) //term fieldnames are interned
         {
             int lastDoc = -1;
             //set non duplicates
             TermDocs td = reader.TermDocs(currTerm);
             if (td.Next())
             {
                 if (keepMode == KM_USE_FIRST_OCCURRENCE)
                 {
                     bits.Set(td.Doc);
                 }
                 else
                 {
                     do
                     {
                         lastDoc = td.Doc;
                     } while (td.Next());
                     bits.Set(lastDoc);
                 }
             }
             if (!te.Next())
             {
                 break;
             }
             currTerm = te.Term;
         }
     }
     return bits;
 }
 /// <summary>
 /// Creates a new <c>SingleTermEnum</c>.
 /// <p/>
 /// After calling the constructor the enumeration is already pointing to the term,
 ///  if it exists.
 /// </summary>
 public SingleTermEnum(IndexReader reader, Term singleTerm)
 {
     this.singleTerm = singleTerm;
     SetEnum(reader.Terms(singleTerm));
 }
			protected internal double[] ComputeDistances(IndexReader reader)
			{
				double[] retArray = null;
				var termDocs = reader.TermDocs();
				var termEnum = reader.Terms(new Term(Constants.SpatialShapeFieldName));
				try
				{
					do
					{
						Term term = termEnum.Term();
						if (term == null)
							break;

						Debug.Assert(Constants.SpatialShapeFieldName.Equals(term.Field()));

						Shape termval;
						try
						{
							termval = SpatialIndex.RavenSpatialContext.ReadShape(term.Text()); // read shape
						}
						catch (InvalidShapeException)
						{
							continue;
						}

						var pt = termval as Point;
						if (pt == null)
							continue;

						var distance = SpatialIndex.RavenSpatialContext.GetDistCalc().Distance(pt, originPt);

						if (retArray == null)
							// late init
							retArray = new double[reader.MaxDoc()];
						termDocs.Seek(termEnum);
						while (termDocs.Next())
						{
							retArray[termDocs.Doc()] = distance;
						}
					} while (termEnum.Next());
				}
				finally
				{
					termDocs.Close();
					termEnum.Close();
				}
				return retArray ?? new double[reader.MaxDoc()];
			}
Example #21
            protected internal override StringIndex CreateValue(IndexReader reader, Entry entryKey, IState state)
            {
                System.String field           = StringHelper.Intern(entryKey.field);
                int[]         retArray        = new int[reader.MaxDoc];
                int[]         retArrayOrdered = new int[reader.MaxDoc];
                for (int i = 0; i < retArrayOrdered.Length; i++)
                {
                    retArrayOrdered[i] = -1;
                }

                var length = reader.MaxDoc + 1;
                UnmanagedStringArray mterms   = new UnmanagedStringArray(length);
                TermDocs             termDocs = reader.TermDocs(state);

                SegmentTermEnum termEnum = (SegmentTermEnum)reader.Terms(new Term(field), state);
                int             t        = 0; // current term number
                int             docIndex = 0;

                // an entry for documents that have no terms in this field
                // should a document with no terms be at top or bottom?
                // this puts them at the top - if it is changed, FieldDocSortedHitQueue
                // needs to change as well.
                t++;

                try
                {
                    do
                    {
                        if (termEnum.termBuffer.Field != field || t >= length)
                        {
                            break;
                        }

                        // store term text
                        mterms.Add(termEnum.termBuffer.TextAsSpan);

                        termDocs.Seek(termEnum, state);
                        while (termDocs.Next(state))
                        {
                            var pt = retArray[termDocs.Doc];
                            retArray[termDocs.Doc] = t;

                            if (pt == 0)
                            {
                                retArrayOrdered[docIndex++] = termDocs.Doc;
                            }
                        }

                        t++;
                    }while (termEnum.Next(state));
                }
                finally
                {
                    termDocs.Close();
                    termEnum.Close();
                }

                StringIndex value_Renamed = new StringIndex(retArray, retArrayOrdered, mterms);

                return(value_Renamed);
            }
Example #22
 public override TermEnum Terms(IState state)
 {
     EnsureOpen();
     return(in_Renamed.Terms(state));
 }
Example #23
        private void  MergeTermInfos(FormatPostingsFieldsConsumer consumer)
        {
            int base_Renamed = 0;
            int readerCount  = readers.Count;

            for (int i = 0; i < readerCount; i++)
            {
                IndexReader      reader   = readers[i];
                TermEnum         termEnum = reader.Terms();
                SegmentMergeInfo smi      = new SegmentMergeInfo(base_Renamed, termEnum, reader);
                int[]            docMap   = smi.GetDocMap();
                if (docMap != null)
                {
                    if (docMaps == null)
                    {
                        docMaps   = new int[readerCount][];
                        delCounts = new int[readerCount];
                    }
                    docMaps[i]   = docMap;
                    delCounts[i] = smi.reader.MaxDoc - smi.reader.NumDocs();
                }

                base_Renamed += reader.NumDocs();

                System.Diagnostics.Debug.Assert(reader.NumDocs() == reader.MaxDoc - smi.delCount);

                if (smi.Next())
                {
                    queue.Add(smi);
                }
                // initialize queue
                else
                {
                    smi.Dispose();
                }
            }

            SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

            System.String currentField = null;
            FormatPostingsTermsConsumer termsConsumer = null;

            while (queue.Size() > 0)
            {
                int matchSize = 0; // pop matching terms
                match[matchSize++] = queue.Pop();
                Term             term = match[0].term;
                SegmentMergeInfo top  = queue.Top();

                while (top != null && term.CompareTo(top.term) == 0)
                {
                    match[matchSize++] = queue.Pop();
                    top = queue.Top();
                }

                if ((System.Object)currentField != (System.Object)term.Field)
                {
                    currentField = term.Field;
                    if (termsConsumer != null)
                    {
                        termsConsumer.Finish();
                    }
                    FieldInfo fieldInfo = fieldInfos.FieldInfo(currentField);
                    termsConsumer            = consumer.AddField(fieldInfo);
                    omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
                }

                int df = AppendPostings(termsConsumer, match, matchSize); // add new TermInfo

                checkAbort.Work(df / 3.0);

                while (matchSize > 0)
                {
                    SegmentMergeInfo smi = match[--matchSize];
                    if (smi.Next())
                    {
                        queue.Add(smi);
                    }
                    // restore queue
                    else
                    {
                        smi.Dispose(); // done with a segment
                    }
                }
            }
        }
        private static void FillCache(IndexSearcherHolder.IndexSearcherHoldingState state, IEnumerable<string> fieldsToRead,IndexReader reader)
        {
            foreach (var field in fieldsToRead)
            {
	            var items = new LinkedList<IndexSearcherHolder.IndexSearcherHoldingState.CacheVal>[reader.MaxDoc];
                using (var termDocs = reader.TermDocs())
                {
                    using (var termEnum = reader.Terms(new Term(field)))
                    {
                        do
                        {
                            if (termEnum.Term == null || field != termEnum.Term.Field)
                                break;

                            Term term = termEnum.Term;
                            if (LowPrecisionNumber(term.Field, term.Text))
                                continue;


                            var totalDocCountIncludedDeletes = termEnum.DocFreq();
                            termDocs.Seek(termEnum.Term);

                            while (termDocs.Next() && totalDocCountIncludedDeletes > 0)
                            {
                                totalDocCountIncludedDeletes -= 1;
                                if (reader.IsDeleted(termDocs.Doc))
                                    continue;
								if(items[termDocs.Doc] == null)
									items[termDocs.Doc] = new LinkedList<IndexSearcherHolder.IndexSearcherHoldingState.CacheVal>();

	                            items[termDocs.Doc].AddLast(new IndexSearcherHolder.IndexSearcherHoldingState.CacheVal
	                            {
		                            Term = termEnum.Term
	                            });
                            }
                        } while (termEnum.Next());
                    }
                }
	            state.SetInCache(field, items);
            }
        }
Example #25
        public static RavenJObject[] ReadAllEntriesFromIndex(IndexReader reader)
        {
            if (reader.MaxDoc > 512 * 1024)
            {
                throw new InvalidOperationException("Refusing to extract all index entries from an index with " + reader.MaxDoc +
                                                    " entries, because of the probable time / memory costs associated with that." +
                                                    Environment.NewLine +
                                                    "Viewing Index Entries are a debug tool, and should not be used on indexes of this size. You might want to try Luke, instead.");
            }
            var results = new RavenJObject[reader.MaxDoc];
            using (var termDocs = reader.TermDocs())
            using (var termEnum = reader.Terms())
            {
                while (termEnum.Next())
                {
                    var term = termEnum.Term;
                    if (term == null)
                        break;

                    var text = term.Text;

                    termDocs.Seek(termEnum);
                    for (int i = 0; i < termEnum.DocFreq() && termDocs.Next(); i++)
                    {
                        RavenJObject result = results[termDocs.Doc];
                        if (result == null)
                            results[termDocs.Doc] = result = new RavenJObject();
                        var propertyName = term.Field;
                        if (propertyName.EndsWith("_ConvertToJson") ||
                            propertyName.EndsWith("_IsArray"))
                            continue;
                        if (result.ContainsKey(propertyName))
                        {
                            switch (result[propertyName].Type)
                            {
                                case JTokenType.Array:
                                    ((RavenJArray)result[propertyName]).Add(text);
                                    break;
                                case JTokenType.String:
                                    result[propertyName] = new RavenJArray
									{
										result[propertyName],
										text
									};
                                    break;
                                default:
                                    throw new ArgumentException("No idea how to handle " + result[propertyName].Type);
                            }
                        }
                        else
                        {
                            result[propertyName] = text;
                        }
                    }
                }
            }
            return results;
        }
        /// <summary>
        /// loads multi-value facet data. This method uses a workarea to prepare loading.
        /// </summary>
        /// <param name="fieldName"></param>
        /// <param name="reader"></param>
        /// <param name="listFactory"></param>
        /// <param name="workArea"></param>
        public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
        {
            long t0 = Environment.TickCount;
            int maxdoc = reader.MaxDoc;
            BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

            TermEnum tenum = null;
            TermDocs tdoc = null;
            ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
            List<int> minIDList = new List<int>();
            List<int> maxIDList = new List<int>();
            List<int> freqList = new List<int>();
            OpenBitSet bitset = new OpenBitSet();
            int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
            int t = 0; // current term number
            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            t++;

            _overflow = false;
            try
            {
                tdoc = reader.TermDocs();
                tenum = reader.Terms(new Term(fieldName, ""));
                if (tenum != null)
                {
                    do
                    {
                        Term term = tenum.Term;
                        if (term == null || !fieldName.Equals(term.Field))
                            break;

                        string val = term.Text;

                        if (val != null)
                        {
                            list.Add(val);

                            tdoc.Seek(tenum);
                            //freqList.add(tenum.docFreq()); // removed because the df doesn't take into account the num of deletedDocs
                            int df = 0;
                            int minID = -1;
                            int maxID = -1;
                            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                            if (tdoc.Next())
                            {
                                df++;
                                int docid = tdoc.Doc;

                                if (!loader.Add(docid, valId))
                                    LogOverflow(fieldName);
                                minID = docid;
                                bitset.Set(docid);
                                while (tdoc.Next())
                                {
                                    df++;
                                    docid = tdoc.Doc;

                                    if (!loader.Add(docid, valId))
                                        LogOverflow(fieldName);
                                    bitset.Set(docid);
                                }
                                maxID = docid;
                            }
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }

                        t++;
                    }
                    while (tenum.Next());
                }
            }
            finally
            {
                try
                {
                    if (tdoc != null)
                    {
                        tdoc.Dispose();
                    }
                }
                finally
                {
                    if (tenum != null)
                    {
                        tenum.Dispose();
                    }
                }
            }

            list.Seal();

            try
            {
                _nestedArray.Load(maxdoc + 1, loader);
            }
            catch (System.IO.IOException e)
            {
                throw e;
            }
            catch (Exception e)
            {
                throw new RuntimeException("failed to load due to " + e.ToString(), e);
            }

            this.valArray = list;
            this.freqs = freqList.ToArray();
            this.minIDs = minIDList.ToArray();
            this.maxIDs = maxIDList.ToArray();

            int doc = 0;
            while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
            {
                ++doc;
            }
            if (doc <= maxdoc)
            {
                this.minIDs[0] = doc;
                doc = maxdoc;
                while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
                {
                    --doc;
                }
                if (doc > 0)
                {
                    this.maxIDs[0] = doc;
                }
            }
            this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
        }
		// There are two ways we can determine the max_results
		// most recent items:  
		//
		// One is to instantiate Lucene documents for each of
		// the document IDs in primary_matches.  This is a
		// fairly expensive operation.
		//
		// The other is to walk through the list of all
		// document IDs in descending time order.  This is
		// a less expensive operation, but adds up over time
		// on large data sets.
		//
		// We can walk about 2.5 docs for every Document we
		// instantiate.  So what we'll do, if we have more
		// matches than available hits, is walk (m * 1.25)
		// docs to see if we can fill out the top 100 hits.
		// If not, we'll fall back to creating documents
		// for all of them.

		private static ArrayList ScanRecentDocs (IndexReader	    primary_reader,
						    IndexReader		    secondary_reader,
						    BetterBitArray	    primary_matches,
						    Dictionary<int, Hit>    hits_by_id,
						    int			    max_results,
						    ref int		    total_number_of_matches,
						    HitFilter		    hit_filter,
						    string		    index_name)
		{
			Stopwatch a = new Stopwatch ();
			a.Start ();

			TermDocs docs = primary_reader.TermDocs ();
			TermEnum enumerator = primary_reader.Terms (new Term ("InvertedTimestamp", String.Empty));
			ArrayList results = new ArrayList (max_results);
			int docs_found = 0;
			int docs_walked = 0;
			int hit_filter_removed = 0;
			int max_docs = (int) (primary_matches.TrueCount * 1.25);

			Term term;
			TermDocs secondary_term_docs = null;
			if (secondary_reader != null)
				secondary_term_docs = secondary_reader.TermDocs ();

			do {
				term = enumerator.Term ();
			
				if (term.Field () != "InvertedTimestamp")
					break;

				docs.Seek (enumerator);

				while (docs.Next ()
				       && docs_found < max_results
				       && docs_walked < max_docs) {
					int doc_id = docs.Doc ();

					if (primary_matches.Get (doc_id)) {
						Document doc = primary_reader.Document (doc_id);
						Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);

						// If we have a HitFilter, apply it.
						if (hit_filter != null && ! hit_filter (hit)) {
							if (Debug)
								Log.Debug ("Filtered out {0}", hit.Uri);
							hit_filter_removed ++;
							continue;
						}
						hits_by_id [doc_id] = hit;
						// Add the result, last modified first
						results.Add (hit);
						docs_found++;
					}
			
					docs_walked++;
				}
			} while (enumerator.Next ()
				 && docs_found < max_results
				 && docs_walked < max_docs);

			docs.Close ();
			if (secondary_term_docs != null)
				secondary_term_docs.Close ();

			// If we've found all the docs we can return in a subset!
			// Fantastic, we've probably short circuited a slow search.
			if (docs_found != max_results) {
				// Otherwise bad luck! Not all docs found
				// Start afresh - this time traversing all results
				results = null;
			} else {
				// Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
				// max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
				// We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
				// every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
				// 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
				total_number_of_matches -= hit_filter_removed;
			}

			a.Stop ();
			if (Debug) {
				Log.Debug (">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);
				
				if (docs_found == max_results)
					Log.Debug (">>> {0}: Successfully short circuited timestamp ordering!", index_name);
			}

			return results;
		}
Example #28
        private OpenBitSet FastBits(IndexReader reader)
        {

            OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
            bits.Set(0, reader.MaxDoc()); //assume all are valid
            Term startTerm = new Term(fieldName);
            TermEnum te = reader.Terms(startTerm);
            if (te != null)
            {
                Term currTerm = te.Term();

                while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
                {
                    if (te.DocFreq() > 1)
                    {
                        int lastDoc = -1;
                        //unset potential duplicates
                        TermDocs td = reader.TermDocs(currTerm);
                        td.Next();
                        if (keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            td.Next();
                        }
                        do
                        {
                            lastDoc = td.Doc();
                            bits.Clear(lastDoc);
                        } while (td.Next());
                        if (keepMode == KM_USE_LAST_OCCURRENCE)
                        {
                            //restore the last bit
                            bits.Set(lastDoc);
                        }
                    }
                    if (!te.Next())
                    {
                        break;
                    }
                    currTerm = te.Term();
                }
            }
            return bits;
        }
        public static void  VerifyEquals(IndexReader r1, IndexReader r2, System.String idField)
        {
            Assert.AreEqual(r1.NumDocs(), r2.NumDocs());
            bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc());

            int[] r2r1 = new int[r2.MaxDoc()];             // r2 id to r1 id mapping

            TermDocs termDocs1 = r1.TermDocs();
            TermDocs termDocs2 = r2.TermDocs();

            // create mapping from id2 space to id1 space based on idField
            idField = StringHelper.Intern(idField);
            TermEnum termEnum = r1.Terms(new Term(idField, ""));

            do
            {
                Term term = termEnum.Term();
                if (term == null || (System.Object)term.Field() != (System.Object)idField)
                {
                    break;
                }

                termDocs1.Seek(termEnum);
                if (!termDocs1.Next())
                {
                    // This doc is deleted and wasn't replaced
                    termDocs2.Seek(termEnum);
                    Assert.IsFalse(termDocs2.Next());
                    continue;
                }

                int id1 = termDocs1.Doc();
                Assert.IsFalse(termDocs1.Next());

                termDocs2.Seek(termEnum);
                Assert.IsTrue(termDocs2.Next());
                int id2 = termDocs2.Doc();
                Assert.IsFalse(termDocs2.Next());

                r2r1[id2] = id1;

                // verify stored fields are equivalent
                try
                {
                    VerifyEquals(r1.Document(id1), r2.Document(id2));
                }
                catch (System.Exception t)
                {
                    System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
                    System.Console.Out.WriteLine("  d1=" + r1.Document(id1));
                    System.Console.Out.WriteLine("  d2=" + r2.Document(id2));
                    throw t;
                }

                try
                {
                    // verify term vectors are equivalent
                    VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2));
                }
                catch (System.Exception e)
                {
                    System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
                    TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1);
                    System.Console.Out.WriteLine("  d1=" + tv1);
                    if (tv1 != null)
                    {
                        for (int i = 0; i < tv1.Length; i++)
                        {
                            System.Console.Out.WriteLine("    " + i + ": " + tv1[i]);
                        }
                    }

                    TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2);
                    System.Console.Out.WriteLine("  d2=" + tv2);
                    if (tv2 != null)
                    {
                        for (int i = 0; i < tv2.Length; i++)
                        {
                            System.Console.Out.WriteLine("    " + i + ": " + tv2[i]);
                        }
                    }

                    throw e;
                }
            }while (termEnum.Next());

            termEnum.Close();

            // Verify postings
            TermEnum termEnum1 = r1.Terms(new Term("", ""));
            TermEnum termEnum2 = r2.Terms(new Term("", ""));

            // pack both doc and freq into single element for easy sorting
            long[] info1 = new long[r1.NumDocs()];
            long[] info2 = new long[r2.NumDocs()];

            for (; ;)
            {
                Term term1, term2;

                // iterate until we get some docs
                int len1;
                for (; ;)
                {
                    len1  = 0;
                    term1 = termEnum1.Term();
                    if (term1 == null)
                    {
                        break;
                    }
                    termDocs1.Seek(termEnum1);
                    while (termDocs1.Next())
                    {
                        int d1 = termDocs1.Doc();
                        int f1 = termDocs1.Freq();
                        info1[len1] = (((long)d1) << 32) | f1;
                        len1++;
                    }
                    if (len1 > 0)
                    {
                        break;
                    }
                    if (!termEnum1.Next())
                    {
                        break;
                    }
                }

                // iterate until we get some docs
                int len2;
                for (; ;)
                {
                    len2  = 0;
                    term2 = termEnum2.Term();
                    if (term2 == null)
                    {
                        break;
                    }
                    termDocs2.Seek(termEnum2);
                    while (termDocs2.Next())
                    {
                        int d2 = termDocs2.Doc();
                        int f2 = termDocs2.Freq();
                        info2[len2] = (((long)r2r1[d2]) << 32) | f2;
                        len2++;
                    }
                    if (len2 > 0)
                    {
                        break;
                    }
                    if (!termEnum2.Next())
                    {
                        break;
                    }
                }

                if (!hasDeletes)
                {
                    Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq());
                }

                Assert.AreEqual(len1, len2);
                if (len1 == 0)
                {
                    break;                     // no more terms
                }
                Assert.AreEqual(term1, term2);

                // sort info2 to get it into ascending docid
                System.Array.Sort(info2, 0, len2);

                // now compare
                for (int i = 0; i < len1; i++)
                {
                    Assert.AreEqual(info1[i], info2[i]);
                }

                termEnum1.Next();
                termEnum2.Next();
            }
        }
 public TermsEnumCompatibility(IndexReader reader, String fieldName)
 {
     this.reader = reader;
     this.fieldName = string.Intern(fieldName);
     this.termEnum = reader.Terms(new Term(this.fieldName));
 }
Example #31
		public static void  AssertIndexEquals(IndexReader index1, IndexReader index2)
		{
			Assert.AreEqual(index1.NumDocs(), index2.NumDocs(), "IndexReaders have different values for numDocs.");
			Assert.AreEqual(index1.MaxDoc, index2.MaxDoc, "IndexReaders have different values for maxDoc.");
			Assert.AreEqual(index1.HasDeletions, index2.HasDeletions, "Only one IndexReader has deletions.");
			Assert.AreEqual(index1.IsOptimized(), index2.IsOptimized(), "Only one index is optimized.");
			
			// check field names
			System.Collections.Generic.ICollection<string> fieldsNames1 = index1.GetFieldNames(FieldOption.ALL);
			System.Collections.Generic.ICollection<string> fieldsNames2 = index1.GetFieldNames(FieldOption.ALL);

            System.Collections.Generic.ICollection<IFieldable> fields1 = null;
            System.Collections.Generic.ICollection<IFieldable> fields2 = null;

            Assert.AreEqual(fieldsNames1.Count, fieldsNames2.Count, "IndexReaders have different numbers of fields.");
            System.Collections.IEnumerator it1 = fieldsNames1.GetEnumerator();
            System.Collections.IEnumerator it2 = fieldsNames2.GetEnumerator();
			while (it1.MoveNext() && it2.MoveNext())
			{
				Assert.AreEqual((System.String) it1.Current, (System.String) it2.Current, "Different field names.");
			}
			
			// check norms
            it1 = fieldsNames1.GetEnumerator();
			while (it1.MoveNext())
			{
				System.String curField = (System.String) it1.Current;
				byte[] norms1 = index1.Norms(curField);
				byte[] norms2 = index2.Norms(curField);
				if (norms1 != null && norms2 != null)
				{
					Assert.AreEqual(norms1.Length, norms2.Length);
					for (int i = 0; i < norms1.Length; i++)
					{
						Assert.AreEqual(norms1[i], norms2[i], "Norm different for doc " + i + " and field '" + curField + "'.");
					}
				}
				else
				{
					Assert.AreSame(norms1, norms2);
				}
			}
			
			// check deletions
			for (int i = 0; i < index1.MaxDoc; i++)
			{
				Assert.AreEqual(index1.IsDeleted(i), index2.IsDeleted(i), "Doc " + i + " only deleted in one index.");
			}
			
			// check stored fields
			for (int i = 0; i < index1.MaxDoc; i++)
			{
				if (!index1.IsDeleted(i))
				{
					Document doc1 = index1.Document(i);
					Document doc2 = index2.Document(i);
					fields1 = doc1.GetFields();
					fields2 = doc2.GetFields();
					Assert.AreEqual(fields1.Count, fields2.Count, "Different numbers of fields for doc " + i + ".");
					it1 = fields1.GetEnumerator();
					it2 = fields2.GetEnumerator();
					while (it1.MoveNext() && it2.MoveNext())
					{
						Field curField1 = (Field) it1.Current;
						Field curField2 = (Field) it2.Current;
						Assert.AreEqual(curField1.Name, curField2.Name, "Different fields names for doc " + i + ".");
						Assert.AreEqual(curField1.StringValue, curField2.StringValue, "Different field values for doc " + i + ".");
					}
				}
			}
			
			// check dictionary and posting lists
			TermEnum enum1 = index1.Terms();
			TermEnum enum2 = index2.Terms();
			TermPositions tp1 = index1.TermPositions();
			TermPositions tp2 = index2.TermPositions();
			while (enum1.Next())
			{
				Assert.IsTrue(enum2.Next());
				Assert.AreEqual(enum1.Term, enum2.Term, "Different term in dictionary.");
				tp1.Seek(enum1.Term);
				tp2.Seek(enum1.Term);
				while (tp1.Next())
				{
					Assert.IsTrue(tp2.Next());
					Assert.AreEqual(tp1.Doc, tp2.Doc, "Different doc id in postinglist of term " + enum1.Term + ".");
					Assert.AreEqual(tp1.Freq, tp2.Freq, "Different term frequency in postinglist of term " + enum1.Term + ".");
					for (int i = 0; i < tp1.Freq; i++)
					{
						Assert.AreEqual(tp1.NextPosition(), tp2.NextPosition(), "Different positions in postinglist of term " + enum1.Term + ".");
					}
				}
			}
		}
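Note that enum1 drives the dictionary loop above, so the assertions would still pass if index2 contained extra terms beyond the last term of index1. A hedged one-line addition, not part of the original test, would close that gap:

			Assert.IsFalse(enum2.Next(), "Index 2 has more terms than index 1.");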
Example #32
 public override TermEnum Terms()
 {
     EnsureOpen();
     return in_Renamed.Terms();
 }
        /*
           * Automatically adds stop words for the given field, drawn from terms whose document frequency exceeds maxDocFreq
           *
           * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
           *                   exceed the required document frequency
           * @param fieldName  The field for which stopwords will be added
           * @param maxDocFreq The maximum number of index documents which
           *                   can contain a term, after which the term is considered to be a stop word.
           * @return The number of stop words identified.
           * @throws IOException
           */
        public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
        {
            var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
            String internedFieldName = StringHelper.Intern(fieldName);
            TermEnum te = reader.Terms(new Term(fieldName));
            Term term = te.Term;
            while (term != null)
            {
                if (term.Field != internedFieldName)
                {
                    break;
                }
                if (te.DocFreq() > maxDocFreq)
                {
                    stopWords.Add(term.Text);
                }
                if (!te.Next())
                {
                    break;
                }
                term = te.Term;
            }
            te.Dispose(); // release the term enumeration
            stopWordsPerField.Add(fieldName, stopWords);

            /* if the stopwords for a field are changed,
             * then saved streams for that field are erased.
             */
            IDictionary<String,SavedStreams> streamMap = (IDictionary<String,SavedStreams>) PreviousTokenStream;
            if (streamMap != null)
              streamMap.Remove(fieldName);

            return stopWords.Count;
        }
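This AddStopWords overload appears to belong to the contrib QueryAutoStopWordAnalyzer. A hedged usage sketch, assuming the Lucene.Net 3.x contrib type names and Version constant:

            // wrap a base analyzer, then harvest high-frequency terms as stop words
            var baseAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
            var analyzer = new QueryAutoStopWordAnalyzer(Lucene.Net.Util.Version.LUCENE_30, baseAnalyzer);
            // treat any "body" term occurring in more than 1000 documents as a stop word
            int numStopWords = analyzer.AddStopWords(reader, "body", 1000);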
        public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
        {
            long t0 = System.Environment.TickCount;
            int maxdoc = reader.MaxDoc;
            BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
            BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

            TermEnum tenum = null;
            TermDocs tdoc = null;
            var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
            List<int> minIDList = new List<int>();
            List<int> maxIDList = new List<int>();
            List<int> freqList = new List<int>();
            OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
            int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
            int t = 0; // current term number
            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            t++;

            _overflow = false;

            string pre = null;

            int df = 0;
            int minID = -1;
            int maxID = -1;
            int valId = 0;

            try
            {
                tdoc = reader.TermDocs();
                tenum = reader.Terms(new Term(fieldName, ""));
                if (tenum != null)
                {
                    do
                    {
                        Term term = tenum.Term;
                        if (term == null || !fieldName.Equals(term.Field))
                            break;

                        string val = term.Text;

                        if (val != null)
                        {
                            int weight = 0;
                            string[] split = val.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
                            if (split.Length > 1)
                            {
                                val = split[0];
                                weight = int.Parse(split[split.Length - 1]);
                            }
                            if (pre == null || !val.Equals(pre))
                            {
                                if (pre != null)
                                {
                                    freqList.Add(df);
                                    minIDList.Add(minID);
                                    maxIDList.Add(maxID);
                                }

                                list.Add(val);

                                df = 0;
                                minID = -1;
                                maxID = -1;
                                valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                                t++;
                            }

                            tdoc.Seek(tenum);
                            if (tdoc.Next())
                            {
                                df++;
                                int docid = tdoc.Doc;

                                if (!loader.Add(docid, valId)) LogOverflow(fieldName);
                                else weightLoader.Add(docid, weight);

                                if (minID == -1 || docid < minID) minID = docid; // first posting of a value initializes minID
                                bitset.FastSet(docid);
                                while (tdoc.Next())
                                {
                                    df++;
                                    docid = tdoc.Doc;

                                    if (!loader.Add(docid, valId)) LogOverflow(fieldName);
                                    else weightLoader.Add(docid, weight);

                                    bitset.FastSet(docid);
                                }
                                if (docid > maxID) maxID = docid;
                            }
                            pre = val;
                        }

                    }
                    while (tenum.Next());
                    if (pre != null)
                    {
                        freqList.Add(df);
                        minIDList.Add(minID);
                        maxIDList.Add(maxID);
                    }
                }
            }
            finally
            {
                try
                {
                    if (tdoc != null)
                    {
                        tdoc.Dispose();
                    }
                }
                finally
                {
                    if (tenum != null)
                    {
                        tenum.Dispose();
                    }
                }
            }

            list.Seal();

            try
            {
                _nestedArray.Load(maxdoc + 1, loader);
                _weightArray.Load(maxdoc + 1, weightLoader);
            }
            catch (System.IO.IOException)
            {
                throw; // rethrow as-is, preserving the stack trace
            }
            catch (Exception e)
            {
                throw new RuntimeException("failed to load due to " + e.ToString(), e);
            }

            this.valArray = list;
            this.freqs = freqList.ToArray();
            this.minIDs = minIDList.ToArray();
            this.maxIDs = maxIDList.ToArray();

            int doc = 0;
            while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
            {
                ++doc;
            }
            if (doc <= maxdoc)
            {
                this.minIDs[0] = doc;
                doc = maxdoc;
                while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
                {
                    --doc;
                }
                if (doc > 0)
                {
                    this.maxIDs[0] = doc;
                }
            }
            this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();   
        }
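The loader above splits every term text on '\0' to separate the value from an optional trailing weight, so a weighted multi-value field has to be indexed with composite tokens. A minimal sketch of that convention (the helper name is an assumption):

            // compose the token this loader expects: the value, then '\0', then the weight;
            // MakeWeightedTerm("red", 3) produces "red" + '\0' + "3"
            static string MakeWeightedTerm(string value, int weight)
            {
                return value + '\0' + weight;
            }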
        private static void FillCache(IndexSearcherHolder.IndexSearcherHoldingState state, List<string> fieldsToRead,IndexReader reader)
        {
            foreach (var field in fieldsToRead)
            {
                using (var termDocs = reader.TermDocs())
                {
                    using (var termEnum = reader.Terms(new Term(field)))
                    {
                        do
                        {
                            if (termEnum.Term == null || field != termEnum.Term.Field)
                                break;

                            if (LowPrecisionNumber(termEnum.Term))
                                continue;


                            var totalDocCountIncludedDeletes = termEnum.DocFreq();
                            termDocs.Seek(termEnum.Term);

                            while (termDocs.Next() && totalDocCountIncludedDeletes > 0)
                            {
                                totalDocCountIncludedDeletes -= 1;
                                if (reader.IsDeleted(termDocs.Doc))
                                    continue;

                                state.SetInCache(field, termDocs.Doc, termEnum.Term);
                            }
                        } while (termEnum.Next());
                    }
                }
            }
        }
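FillCache bounds its inner loop with DocFreq(), which counts postings of deleted documents as well; that is why each hit is still filtered through IsDeleted. A hedged helper in the same spirit (the name is an assumption) that counts only live documents for a term:

        static int CountLiveDocs(IndexReader reader, Term term)
        {
            int count = 0;
            using (TermDocs termDocs = reader.TermDocs(term))
            {
                while (termDocs.Next())
                {
                    // skip postings that belong to deleted documents
                    if (!reader.IsDeleted(termDocs.Doc))
                        count++;
                }
            }
            return count;
        }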
        public virtual void  TestPhrasePrefix()
        {
            RAMDirectory indexStore = new RAMDirectory();
            IndexWriter  writer     = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED, null);

            Add("blueberry pie", writer);
            Add("blueberry strudel", writer);
            Add("blueberry pizza", writer);
            Add("blueberry chewing gum", writer);
            Add("bluebird pizza", writer);
            Add("bluebird foobar pizza", writer);
            Add("piccadilly circus", writer);
            writer.Optimize(null);
            writer.Close();

            IndexSearcher searcher = new IndexSearcher(indexStore, true, null);

            // search for "blueberry pi*":
            MultiPhraseQuery query1 = new MultiPhraseQuery();
            // search for "strawberry pi*":
            MultiPhraseQuery query2 = new MultiPhraseQuery();

            query1.Add(new Term("body", "blueberry"));
            query2.Add(new Term("body", "strawberry"));

            System.Collections.ArrayList termsWithPrefix = new System.Collections.ArrayList();
            IndexReader ir = IndexReader.Open((Directory)indexStore, true, null);

            // this TermEnum gives "piccadilly", "pie" and "pizza".
            System.String prefix = "pi";
            TermEnum      te     = ir.Terms(new Term("body", prefix), null);

            do
            {
                if (te.Term.Text.StartsWith(prefix))
                {
                    termsWithPrefix.Add(te.Term);
                }
            } while (te.Next(null));

            query1.Add((Term[])termsWithPrefix.ToArray(typeof(Term)));
            Assert.AreEqual("body:\"blueberry (piccadilly pie pizza)\"", query1.ToString());
            query2.Add((Term[])termsWithPrefix.ToArray(typeof(Term)));
            Assert.AreEqual("body:\"strawberry (piccadilly pie pizza)\"", query2.ToString());

            ScoreDoc[] result;
            result = searcher.Search(query1, null, 1000, null).ScoreDocs;
            Assert.AreEqual(2, result.Length);
            result = searcher.Search(query2, null, 1000, null).ScoreDocs;
            Assert.AreEqual(0, result.Length);

            // search for "blue* pizza":
            MultiPhraseQuery query3 = new MultiPhraseQuery();

            termsWithPrefix.Clear();
            prefix = "blue";
            te     = ir.Terms(new Term("body", prefix), null);
            do
            {
                if (te.Term.Text.StartsWith(prefix))
                {
                    termsWithPrefix.Add(te.Term);
                }
            } while (te.Next(null));
            query3.Add((Term[])termsWithPrefix.ToArray(typeof(Term)));
            query3.Add(new Term("body", "pizza"));

            result = searcher.Search(query3, null, 1000, null).ScoreDocs;
            Assert.AreEqual(2, result.Length);             // blueberry pizza, bluebird pizza
            Assert.AreEqual("body:\"(blueberry bluebird) pizza\"", query3.ToString());

            // test slop:
            query3.Slop = 1;
            result      = searcher.Search(query3, null, 1000, null).ScoreDocs;
            Assert.AreEqual(3, result.Length);             // blueberry pizza, bluebird pizza, bluebird foobar pizza

            MultiPhraseQuery query4 = new MultiPhraseQuery();

            // all terms in a MultiPhraseQuery must belong to the same field, so mixing fields should throw
            Assert.Throws <ArgumentException>(() =>
            {
                query4.Add(new Term("field1", "foo"));
                query4.Add(new Term("field2", "foobar"));
            });

            searcher.Close();
            indexStore.Close();
        }
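The prefix-collection loop appears twice in this test; because TermEnum is sorted, all terms sharing a prefix are contiguous, so the scan can stop at the first non-matching term. A hedged helper that factors this out (the trailing null mirrors the state argument used by this fork's overloads):

        static Term[] TermsWithPrefix(IndexReader reader, string field, string prefix)
        {
            var result = new System.Collections.Generic.List<Term>();
            TermEnum te = reader.Terms(new Term(field, prefix), null);
            try
            {
                do
                {
                    Term t = te.Term;
                    // sorted order: the first non-matching term ends the prefix range
                    if (t == null || t.Field != field || !t.Text.StartsWith(prefix))
                        break;
                    result.Add(t);
                } while (te.Next(null));
            }
            finally
            {
                te.Close();
            }
            return result.ToArray();
        }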
        public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
        {
            string field = string.Intern(fieldName);
            int maxDoc = reader.MaxDoc;

            if (orderArray == null) // we want to reuse the memory
            {
                orderArray = NewInstance(termCountSize, maxDoc);
            }
            else
            {
                orderArray.EnsureCapacity(maxDoc); // no need to fill with 0, we are resetting the data anyway
            }

            List<int> minIDList = new List<int>();
            List<int> maxIDList = new List<int>();
            List<int> freqList = new List<int>();

            int length = maxDoc + 1;
            ITermValueList list = listFactory == null ? new TermStringList() : listFactory.CreateTermList();
            TermDocs termDocs = reader.TermDocs();
            TermEnum termEnum = reader.Terms(new Term(field));
            int t = 0; // current term number

            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            //int df = 0;
            t++;
            try
            {
                do
                {
                    Term term = termEnum.Term;
                    if (term == null || string.CompareOrdinal(term.Field, field) != 0)
                        break;

                    if (t >= orderArray.MaxValue())
                    {
                        throw new System.IO.IOException("maximum number of values cannot exceed: " + orderArray.MaxValue());
                    }
                    // Alexey: we could now get more than one term per document; effectively, we could build a facet against a tokenized field
                    /*// we expect that there is at most one term per document
                    if (t >= length)
                    {
                        throw new RuntimeException("there are more terms than " + "documents in field \"" + field + "\", but it's impossible to sort on " + "tokenized fields");
                    }*/
                    // store term text
                    list.Add(term.Text);
                    termDocs.Seek(termEnum);
                    // freqList.add(termEnum.docFreq()); // doesn't take into account deldocs
                    int minID = -1;
                    int maxID = -1;
                    int df = 0;
                    if (termDocs.Next())
                    {
                        df++;
                        int docid = termDocs.Doc;
                        orderArray.Add(docid, t);
                        minID = docid;
                        while (termDocs.Next())
                        {
                            df++;
                            docid = termDocs.Doc;
                            orderArray.Add(docid, t);
                        }
                        maxID = docid;
                    }
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);

                    t++;
                } while (termEnum.Next());
            }
            finally
            {
                termDocs.Dispose();
                termEnum.Dispose();
            }
            list.Seal();

            this.valArray = list;
            this.freqs = freqList.ToArray();
            this.minIDs = minIDList.ToArray();
            this.maxIDs = maxIDList.ToArray();
        }
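After Load returns, orderArray maps each doc id to a term ordinal, and valArray, freqs, minIDs and maxIDs are all indexed by that ordinal, with ordinal 0 reserved for documents that have no value. A hedged read-back sketch (member names assumed from the fields populated above):

            // resolve one document's value; ordinal 0 is the reserved "no value" slot
            int ordinal = orderArray.Get(docid);
            string value = ordinal == 0 ? null : valArray.Get(ordinal);
            int docFreq = freqs[ordinal]; // per-value frequency collected during Load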
Example #38
 public override TermEnum Terms()
 {
     return in_Renamed.Terms();
 }