private void VerifyDocFreq()
{
    IndexReader reader = IndexReader.Open(dir);
    TermEnum termEnum = null;

    // create enumeration of all terms
    termEnum = reader.Terms();
    // go to the first term (aaa)
    termEnum.Next();
    // assert that term is 'aaa'
    Assert.AreEqual("aaa", termEnum.Term().Text());
    Assert.AreEqual(200, termEnum.DocFreq());
    // go to the second term (bbb)
    termEnum.Next();
    // assert that term is 'bbb'
    Assert.AreEqual("bbb", termEnum.Term().Text());
    Assert.AreEqual(100, termEnum.DocFreq());
    termEnum.Close();

    // create enumeration of terms after term 'aaa', including 'aaa'
    termEnum = reader.Terms(new Term("content", "aaa"));
    // assert that term is 'aaa'
    Assert.AreEqual("aaa", termEnum.Term().Text());
    Assert.AreEqual(200, termEnum.DocFreq());
    // go to term 'bbb'
    termEnum.Next();
    // assert that term is 'bbb'
    Assert.AreEqual("bbb", termEnum.Term().Text());
    Assert.AreEqual(100, termEnum.DocFreq());
    termEnum.Close();
}
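// The examples above and below share one idiom: position a TermEnum (optionally
// seeked with reader.Terms(new Term(field, text))), then walk it with Next().
// A minimal sketch of that idiom, assuming a hypothetical already-open "reader",
// a field named "content", and the Lucene.Net 2.x method-call style used in the
// preceding example; this is illustrative, not part of the original sources.
private static void DumpTerms(IndexReader reader)
{
    TermEnum termEnum = reader.Terms(new Term("content", ""));
    try
    {
        do
        {
            Term term = termEnum.Term();
            if (term == null || term.Field() != "content")
                break; // ran past the last term of the field
            System.Console.Out.WriteLine(term.Text() + " df=" + termEnum.DocFreq());
        }
        while (termEnum.Next());
    }
    finally
    {
        termEnum.Close();
    }
}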
public MultiTermEnum(IndexReader[] readers, int[] starts, Term t)
{
    queue = new SegmentMergeQueue(readers.Length);
    for (int i = 0; i < readers.Length; i++)
    {
        IndexReader reader = readers[i];
        TermEnum termEnum;
        if (t != null)
        {
            termEnum = reader.Terms(t);
        }
        else
        {
            termEnum = reader.Terms();
        }

        SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
        if (t == null ? smi.Next() : termEnum.Term() != null)
        {
            queue.Put(smi); // initialize queue
        }
        else
        {
            smi.Close();
        }
    }

    if (t != null && queue.Size() > 0)
    {
        Next();
    }
}
public virtual int doTest(int iter, int ndocs, int maxTF, float percentDocs)
{
    Directory dir = new RAMDirectory();

    long start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
    AddDocs(dir, ndocs, "foo", "val", maxTF, percentDocs);
    long end = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
    System.Console.Out.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));

    IndexReader reader = IndexReader.Open(dir);
    TermEnum tenum = reader.Terms(new Term("foo", "val"));
    TermDocs tdocs = reader.TermDocs();

    start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
    int ret = 0;
    for (int i = 0; i < iter; i++)
    {
        tdocs.Seek(tenum);
        while (tdocs.Next())
        {
            ret += tdocs.Doc();
        }
    }
    end = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
    System.Console.Out.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));

    return ret;
}
protected internal override string[] CreateValue(IndexReader reader, Entry entryKey, IState state)
{
    System.String field = StringHelper.Intern(entryKey.field);
    System.String[] retArray = new System.String[reader.MaxDoc];
    TermDocs termDocs = reader.TermDocs(state);
    TermEnum termEnum = reader.Terms(new Term(field), state);
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || (System.Object) term.Field != (System.Object) field)
                break;
            System.String termval = term.Text;
            termDocs.Seek(termEnum, state);
            while (termDocs.Next(state))
            {
                retArray[termDocs.Doc] = termval;
            }
        }
        while (termEnum.Next(state));
    }
    finally
    {
        termDocs.Close();
        termEnum.Close();
    }
    return retArray;
}
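// The method above is the classic "uninversion" pattern shared by the other
// FieldCache CreateValue overloads in this listing: seek a TermEnum to the
// first term of the field, then for each term use TermDocs.Seek(termEnum) to
// stamp the term's value onto every matching document slot. A stripped-down
// sketch of just that loop, assuming Lucene.Net 2.x-style method calls, a
// field named "price", and a hypothetical ParseValue helper (neither is part
// of the original source):
int[] values = new int[reader.MaxDoc()];
TermDocs termDocs = reader.TermDocs();
TermEnum termEnum = reader.Terms(new Term("price", ""));
try
{
    do
    {
        Term term = termEnum.Term();
        if (term == null || term.Field() != "price")
            break;
        int parsed = ParseValue(term.Text()); // hypothetical parser
        termDocs.Seek(termEnum);
        while (termDocs.Next())
            values[termDocs.Doc()] = parsed; // one value per matching doc
    }
    while (termEnum.Next());
}
finally
{
    termDocs.Close();
    termEnum.Close();
}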
protected internal override float[] CreateValue(IndexReader reader, Entry entryKey, IState state)
{
    Entry entry = entryKey;
    System.String field = entry.field;
    FloatParser parser = (FloatParser) entry.custom;
    if (parser == null)
    {
        try
        {
            return wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_FLOAT_PARSER, state);
        }
        catch (System.FormatException)
        {
            return wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.NUMERIC_UTILS_FLOAT_PARSER, state);
        }
    }
    float[] retArray = null;
    TermDocs termDocs = reader.TermDocs(state);
    TermEnum termEnum = reader.Terms(new Term(field), state);
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || (System.Object) term.Field != (System.Object) field)
                break;
            float termval = parser.ParseFloat(term.Text);
            if (retArray == null)
            {
                // late init
                retArray = new float[reader.MaxDoc];
            }
            termDocs.Seek(termEnum, state);
            while (termDocs.Next(state))
            {
                retArray[termDocs.Doc] = termval;
            }
        }
        while (termEnum.Next(state));
    }
    catch (StopFillCacheException)
    {
    }
    finally
    {
        termDocs.Close();
        termEnum.Close();
    }
    if (retArray == null)
    {
        // no values
        retArray = new float[reader.MaxDoc];
    }
    return retArray;
}
public virtual void TestPhrasePrefix()
{
    RAMDirectory indexStore = new RAMDirectory();
    IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED, null);
    Document doc1 = new Document();
    Document doc2 = new Document();
    Document doc3 = new Document();
    Document doc4 = new Document();
    Document doc5 = new Document();
    doc1.Add(new Field("body", "blueberry pie", Field.Store.YES, Field.Index.ANALYZED));
    doc2.Add(new Field("body", "blueberry strudel", Field.Store.YES, Field.Index.ANALYZED));
    doc3.Add(new Field("body", "blueberry pizza", Field.Store.YES, Field.Index.ANALYZED));
    doc4.Add(new Field("body", "blueberry chewing gum", Field.Store.YES, Field.Index.ANALYZED));
    doc5.Add(new Field("body", "piccadilly circus", Field.Store.YES, Field.Index.ANALYZED));
    writer.AddDocument(doc1, null);
    writer.AddDocument(doc2, null);
    writer.AddDocument(doc3, null);
    writer.AddDocument(doc4, null);
    writer.AddDocument(doc5, null);
    writer.Optimize(null);
    writer.Close();

    IndexSearcher searcher = new IndexSearcher(indexStore, true, null);

    //PhrasePrefixQuery query1 = new PhrasePrefixQuery();
    MultiPhraseQuery query1 = new MultiPhraseQuery();
    //PhrasePrefixQuery query2 = new PhrasePrefixQuery();
    MultiPhraseQuery query2 = new MultiPhraseQuery();
    query1.Add(new Term("body", "blueberry"));
    query2.Add(new Term("body", "strawberry"));

    System.Collections.ArrayList termsWithPrefix = new System.Collections.ArrayList();
    IndexReader ir = IndexReader.Open((Directory) indexStore, true, null);

    // this TermEnum gives "piccadilly", "pie" and "pizza".
    System.String prefix = "pi";
    TermEnum te = ir.Terms(new Term("body", prefix + "*"), null);
    do
    {
        if (te.Term.Text.StartsWith(prefix))
        {
            termsWithPrefix.Add(te.Term);
        }
    }
    while (te.Next(null));

    query1.Add((Term[]) termsWithPrefix.ToArray(typeof(Term)));
    query2.Add((Term[]) termsWithPrefix.ToArray(typeof(Term)));

    ScoreDoc[] result;
    result = searcher.Search(query1, null, 1000, null).ScoreDocs;
    Assert.AreEqual(2, result.Length);

    result = searcher.Search(query2, null, 1000, null).ScoreDocs;
    Assert.AreEqual(0, result.Length);
}
private void MergeTermInfos()
{
    int base_Renamed = 0;
    for (int i = 0; i < readers.Count; i++)
    {
        IndexReader reader = (IndexReader) readers[i];
        TermEnum termEnum = reader.Terms();
        SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
        base_Renamed += reader.NumDocs();
        if (smi.Next())
        {
            queue.Put(smi); // initialize queue
        }
        else
        {
            smi.Close();
        }
    }

    SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

    while (queue.Size() > 0)
    {
        int matchSize = 0;

        // pop matching terms
        match[matchSize++] = (SegmentMergeInfo) queue.Pop();
        Term term = match[0].term;
        SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();
        while (top != null && term.CompareTo(top.term) == 0)
        {
            match[matchSize++] = (SegmentMergeInfo) queue.Pop();
            top = (SegmentMergeInfo) queue.Top();
        }

        int df = MergeTermInfo(match, matchSize); // add new TermInfo

        if (checkAbort != null)
        {
            checkAbort.Work(df / 3.0);
        }

        while (matchSize > 0)
        {
            SegmentMergeInfo smi = match[--matchSize];
            if (smi.Next())
            {
                queue.Put(smi); // restore queue
            }
            else
            {
                smi.Close(); // done with a segment
            }
        }
    }
}
/// <summary>
/// Initialization method called by subclasses to simulate a shared
/// base constructor, as generic classes cannot have a parameterized ctor.
/// </summary>
/// <param name="reader">The index reader to read from.</param>
/// <param name="fieldName">The field to enumerate.</param>
/// <param name="includeDocs">Whether this enumerator will support TermDocs.</param>
protected void Init(IndexReader reader, string fieldName, bool includeDocs)
{
    this.termEnum = reader.Terms(new Term(fieldName));
    if (includeDocs)
    {
        this.termDocs = reader.TermDocs();
        this.tdEnum = new TermDocEnumerator.TermDocUsingTermsEnumerator(this.termDocs, this.termEnum);
    }
    this.tEnum = new TermEnumerator(termEnum, termDocs, fieldName, this);
}
public ParallelTermEnum(ParallelReader enclosingInstance, Term term, IState state)
{
    InitBlock(enclosingInstance);
    field = term.Field;
    IndexReader reader = Enclosing_Instance.fieldToReader[field];
    if (reader != null)
    {
        termEnum = reader.Terms(term, state);
    }
}
public ParallelTermEnum(ParallelReader enclosingInstance, Term term)
{
    InitBlock(enclosingInstance);
    field = term.Field();
    IndexReader reader = (IndexReader) Enclosing_Instance.fieldToReader[field];
    if (reader != null)
    {
        termEnum = reader.Terms(term);
    }
}
public RegexTermEnum(IndexReader reader, Term term, IRegexCapabilities regexImpl)
{
    _sField = term.Field;
    string sText = term.Text;

    _regexImpl = regexImpl;
    _regexImpl.Compile(sText);

    _sPre = _regexImpl.Prefix() ?? "";

    SetEnum(reader.Terms(new Term(term.Field, _sPre)));
}
/// <summary>Increments the enumeration to the next element. True if one exists.</summary>
//@Override
public override bool Next()
{
    // if a current term exists, the actual enum is initialized:
    // try to advance to the next term; if no such term exists, fall through
    if (currentTerm != null)
    {
        System.Diagnostics.Debug.Assert(actualEnum != null);
        if (actualEnum.Next())
        {
            currentTerm = actualEnum.Term;
            if (TermCompare(currentTerm))
                return true;
        }
    }

    // if all of the above fails, move forward to the next enum,
    // if one is available
    currentTerm = null;
    while (rangeBounds.Count >= 2)
    {
        // close the current enum and read the next bounds
        if (actualEnum != null)
        {
            actualEnum.Close();
            actualEnum = null;
        }
        string lowerBound = rangeBounds.First.Value;
        rangeBounds.RemoveFirst();
        this.currentUpperBound = rangeBounds.First.Value;
        rangeBounds.RemoveFirst();

        // create a new enum
        actualEnum = reader.Terms(termTemplate.CreateTerm(lowerBound));
        currentTerm = actualEnum.Term;
        if (currentTerm != null && TermCompare(currentTerm))
            return true;
        // clear the current term for the next iteration
        currentTerm = null;
    }

    // no more sub-range enums available
    System.Diagnostics.Debug.Assert(rangeBounds.Count == 0 && currentTerm == null);
    return false;
}
public virtual void TestThreadSafety()
{
    rnd = NewRandom();
    int numThreads = 5;
    int numDocs = 50;
    ByteArrayPool pool = new ByteArrayPool(numThreads, 5);

    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED, null);
    System.String field = "test";

    ThreadClass[] ingesters = new ThreadClass[numThreads];
    for (int i = 0; i < numThreads; i++)
    {
        ingesters[i] = new AnonymousClassThread(numDocs, field, pool, writer, this);
        ingesters[i].Start();
    }

    for (int i = 0; i < numThreads; i++)
    {
        ingesters[i].Join();
    }
    writer.Close();

    IndexReader reader = IndexReader.Open(dir, true, null);
    TermEnum terms = reader.Terms(null);
    while (terms.Next(null))
    {
        TermPositions tp = reader.TermPositions(terms.Term, null);
        while (tp.Next(null))
        {
            int freq = tp.Freq;
            for (int i = 0; i < freq; i++)
            {
                tp.NextPosition(null);
                Assert.AreEqual(pool.BytesToString(tp.GetPayload(new byte[5], 0, null)), terms.Term.Text);
            }
        }
        tp.Close();
    }
    terms.Close();
    reader.Close();

    Assert.AreEqual(pool.Size(), numThreads);
}
private IEnumerable<string> GetFieldValues(IndexReader reader, string groupByField)
{
    TermEnum te = reader.Terms(new Term(groupByField, string.Empty));
    if (te.Term() == null || te.Term().Field() != groupByField)
        return Enumerable.Empty<string>();

    var list = new List<string>();
    list.Add(te.Term().Text());
    while (te.Next())
    {
        if (te.Term().Field() != groupByField)
            break;
        list.Add(te.Term().Text());
    }
    return list;
}
protected internal override short[] CreateValue(IndexReader reader, Entry entryKey, IState state)
{
    Entry entry = entryKey;
    System.String field = entry.field;
    ShortParser parser = (ShortParser) entry.custom;
    if (parser == null)
    {
        return wrapper.GetShorts(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_SHORT_PARSER, state);
    }
    short[] retArray = new short[reader.MaxDoc];
    TermDocs termDocs = reader.TermDocs(state);
    TermEnum termEnum = reader.Terms(new Term(field), state);
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || (System.Object) term.Field != (System.Object) field)
                break;
            short termval = parser.ParseShort(term.Text);
            termDocs.Seek(termEnum, state);
            while (termDocs.Next(state))
            {
                retArray[termDocs.Doc] = termval;
            }
        }
        while (termEnum.Next(state));
    }
    catch (StopFillCacheException)
    {
    }
    finally
    {
        termDocs.Close();
        termEnum.Close();
    }
    return retArray;
}
private static Dictionary<string, int[]> FillCache(IndexReader reader, int docBase, string field)
{
    using (var termDocs = reader.TermDocs())
    {
        var items = new Dictionary<string, int[]>();
        var docsForTerm = new List<int>();
        using (var termEnum = reader.Terms(new Term(field)))
        {
            do
            {
                if (termEnum.Term == null || field != termEnum.Term.Field)
                    break;
                Term term = termEnum.Term;
                if (LowPrecisionNumber(term.Field, term.Text))
                    continue;

                var totalDocCountIncludedDeletes = termEnum.DocFreq();
                termDocs.Seek(termEnum.Term);

                while (termDocs.Next() && totalDocCountIncludedDeletes > 0)
                {
                    var curDoc = termDocs.Doc;
                    totalDocCountIncludedDeletes -= 1;
                    if (reader.IsDeleted(curDoc))
                        continue;

                    docsForTerm.Add(curDoc + docBase);
                }

                docsForTerm.Sort();
                items[term.Text] = docsForTerm.ToArray();
                docsForTerm.Clear();
            }
            while (termEnum.Next());
        }
        return items;
    }
}
private OpenBitSet CorrectBits(IndexReader reader)
{
    OpenBitSet bits = new OpenBitSet(reader.MaxDoc); // assume all are INvalid
    Term startTerm = new Term(fieldName);
    TermEnum te = reader.Terms(startTerm);
    if (te != null)
    {
        Term currTerm = te.Term;
        while ((currTerm != null) && (currTerm.Field == startTerm.Field)) // term fieldnames are interned
        {
            int lastDoc = -1;
            // set non duplicates
            TermDocs td = reader.TermDocs(currTerm);
            if (td.Next())
            {
                if (keepMode == KM_USE_FIRST_OCCURRENCE)
                {
                    bits.Set(td.Doc);
                }
                else
                {
                    do
                    {
                        lastDoc = td.Doc;
                    }
                    while (td.Next());
                    bits.Set(lastDoc);
                }
            }
            if (!te.Next())
            {
                break;
            }
            currTerm = te.Term;
        }
    }
    return bits;
}
public static void VerifyEquals(IndexReader r1, IndexReader r2, System.String idField)
{
    Assert.AreEqual(r1.NumDocs(), r2.NumDocs());
    bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc());

    int[] r2r1 = new int[r2.MaxDoc()]; // r2 id to r1 id mapping

    TermDocs termDocs1 = r1.TermDocs();
    TermDocs termDocs2 = r2.TermDocs();

    // create mapping from id2 space to id1 based on idField
    idField = StringHelper.Intern(idField);
    TermEnum termEnum = r1.Terms(new Term(idField, ""));
    do
    {
        Term term = termEnum.Term();
        if (term == null || (System.Object) term.Field() != (System.Object) idField)
            break;

        termDocs1.Seek(termEnum);
        if (!termDocs1.Next())
        {
            // This doc is deleted and wasn't replaced
            termDocs2.Seek(termEnum);
            Assert.IsFalse(termDocs2.Next());
            continue;
        }

        int id1 = termDocs1.Doc();
        Assert.IsFalse(termDocs1.Next());

        termDocs2.Seek(termEnum);
        Assert.IsTrue(termDocs2.Next());
        int id2 = termDocs2.Doc();
        Assert.IsFalse(termDocs2.Next());

        r2r1[id2] = id1;

        // verify stored fields are equivalent
        try
        {
            VerifyEquals(r1.Document(id1), r2.Document(id2));
        }
        catch (System.Exception t)
        {
            System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
            System.Console.Out.WriteLine("  d1=" + r1.Document(id1));
            System.Console.Out.WriteLine("  d2=" + r2.Document(id2));
            throw t;
        }

        try
        {
            // verify term vectors are equivalent
            VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2));
        }
        catch (System.Exception e)
        {
            System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
            TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1);
            System.Console.Out.WriteLine("  d1=" + tv1);
            if (tv1 != null)
                for (int i = 0; i < tv1.Length; i++)
                    System.Console.Out.WriteLine("    " + i + ": " + tv1[i]);

            TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2);
            System.Console.Out.WriteLine("  d2=" + tv2);
            if (tv2 != null)
                for (int i = 0; i < tv2.Length; i++)
                    System.Console.Out.WriteLine("    " + i + ": " + tv2[i]);

            throw e;
        }
    }
    while (termEnum.Next());

    termEnum.Close();

    // Verify postings
    TermEnum termEnum1 = r1.Terms(new Term("", ""));
    TermEnum termEnum2 = r2.Terms(new Term("", ""));

    // pack both doc and freq into a single element for easy sorting
    long[] info1 = new long[r1.NumDocs()];
    long[] info2 = new long[r2.NumDocs()];

    for (;;)
    {
        Term term1, term2;

        // iterate until we get some docs
        int len1;
        for (;;)
        {
            len1 = 0;
            term1 = termEnum1.Term();
            if (term1 == null)
                break;
            termDocs1.Seek(termEnum1);
            while (termDocs1.Next())
            {
                int d1 = termDocs1.Doc();
                int f1 = termDocs1.Freq();
                info1[len1] = (((long) d1) << 32) | f1;
                len1++;
            }
            if (len1 > 0)
                break;
            if (!termEnum1.Next())
                break;
        }

        // iterate until we get some docs
        int len2;
        for (;;)
        {
            len2 = 0;
            term2 = termEnum2.Term();
            if (term2 == null)
                break;
            termDocs2.Seek(termEnum2);
            while (termDocs2.Next())
            {
                int d2 = termDocs2.Doc();
                int f2 = termDocs2.Freq();
                info2[len2] = (((long) r2r1[d2]) << 32) | f2;
                len2++;
            }
            if (len2 > 0)
                break;
            if (!termEnum2.Next())
                break;
        }

        if (!hasDeletes)
            Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq());

        Assert.AreEqual(len1, len2);
        if (len1 == 0)
            break; // no more terms

        Assert.AreEqual(term1, term2);

        // sort info2 to get it into ascending docid order
        System.Array.Sort(info2, 0, len2 - 0);

        // now compare
        for (int i = 0; i < len1; i++)
        {
            Assert.AreEqual(info1[i], info2[i]);
        }

        termEnum1.Next();
        termEnum2.Next();
    }
}
/// <summary>
/// Creates a new <c>SingleTermEnum</c>.
/// <p/>
/// After calling the constructor the enumeration is already pointing to the term,
/// if it exists.
/// </summary>
public SingleTermEnum(IndexReader reader, Term singleTerm)
{
    this.singleTerm = singleTerm;
    SetEnum(reader.Terms(singleTerm));
}
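// A minimal usage sketch for the ctor above, assuming a hypothetical
// already-open "reader". Depending on the Lucene.Net version, the current term
// is exposed as a Term() method (2.x) or a Term property (3.x); 2.x style is
// used here to match several surrounding examples.
TermEnum single = new SingleTermEnum(reader, new Term("body", "blueberry"));
try
{
    if (single.Term() != null) // the enum already points at the term, if present
        System.Console.Out.WriteLine("docFreq=" + single.DocFreq());
}
finally
{
    single.Close();
}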
protected internal double[] ComputeDistances(IndexReader reader)
{
    double[] retArray = null;

    var termDocs = reader.TermDocs();
    var termEnum = reader.Terms(new Term(Constants.SpatialShapeFieldName));
    try
    {
        do
        {
            Term term = termEnum.Term();
            if (term == null)
                break;

            Debug.Assert(Constants.SpatialShapeFieldName.Equals(term.Field()));

            Shape termval;
            try
            {
                termval = SpatialIndex.RavenSpatialContext.ReadShape(term.Text()); // read shape
            }
            catch (InvalidShapeException)
            {
                continue;
            }

            var pt = termval as Point;
            if (pt == null)
                continue;

            var distance = SpatialIndex.RavenSpatialContext.GetDistCalc().Distance(pt, originPt);

            if (retArray == null)
            {
                // late init
                retArray = new double[reader.MaxDoc()];
            }

            termDocs.Seek(termEnum);
            while (termDocs.Next())
            {
                retArray[termDocs.Doc()] = distance;
            }
        }
        while (termEnum.Next());
    }
    finally
    {
        termDocs.Close();
        termEnum.Close();
    }
    return retArray ?? new double[reader.MaxDoc()];
}
protected internal override StringIndex CreateValue(IndexReader reader, Entry entryKey, IState state)
{
    System.String field = StringHelper.Intern(entryKey.field);
    int[] retArray = new int[reader.MaxDoc];
    int[] retArrayOrdered = new int[reader.MaxDoc];
    for (int i = 0; i < retArrayOrdered.Length; i++)
    {
        retArrayOrdered[i] = -1;
    }
    var length = reader.MaxDoc + 1;
    UnmanagedStringArray mterms = new UnmanagedStringArray(length);
    TermDocs termDocs = reader.TermDocs(state);
    SegmentTermEnum termEnum = (SegmentTermEnum) reader.Terms(new Term(field), state);
    int t = 0; // current term number
    int docIndex = 0;

    // an entry for documents that have no terms in this field
    // should a document with no terms be at top or bottom?
    // this puts them at the top - if it is changed, FieldDocSortedHitQueue
    // needs to change as well.
    t++;

    try
    {
        do
        {
            if (termEnum.termBuffer.Field != field || t >= length)
                break;

            // store term text
            mterms.Add(termEnum.termBuffer.TextAsSpan);

            termDocs.Seek(termEnum, state);
            while (termDocs.Next(state))
            {
                var pt = retArray[termDocs.Doc];
                retArray[termDocs.Doc] = t;
                if (pt == 0)
                {
                    retArrayOrdered[docIndex++] = termDocs.Doc;
                }
            }

            t++;
        }
        while (termEnum.Next(state));
    }
    finally
    {
        termDocs.Close();
        termEnum.Close();
    }

    StringIndex value_Renamed = new StringIndex(retArray, retArrayOrdered, mterms);
    return value_Renamed;
}
public override TermEnum Terms(IState state)
{
    EnsureOpen();
    return in_Renamed.Terms(state);
}
private void MergeTermInfos(FormatPostingsFieldsConsumer consumer)
{
    int base_Renamed = 0;
    int readerCount = readers.Count;
    for (int i = 0; i < readerCount; i++)
    {
        IndexReader reader = readers[i];
        TermEnum termEnum = reader.Terms();
        SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
        int[] docMap = smi.GetDocMap();
        if (docMap != null)
        {
            if (docMaps == null)
            {
                docMaps = new int[readerCount][];
                delCounts = new int[readerCount];
            }
            docMaps[i] = docMap;
            delCounts[i] = smi.reader.MaxDoc - smi.reader.NumDocs();
        }

        base_Renamed += reader.NumDocs();

        System.Diagnostics.Debug.Assert(reader.NumDocs() == reader.MaxDoc - smi.delCount);

        if (smi.Next())
        {
            queue.Add(smi); // initialize queue
        }
        else
        {
            smi.Dispose();
        }
    }

    SegmentMergeInfo[] match = new SegmentMergeInfo[readers.Count];

    System.String currentField = null;
    FormatPostingsTermsConsumer termsConsumer = null;

    while (queue.Size() > 0)
    {
        int matchSize = 0;

        // pop matching terms
        match[matchSize++] = queue.Pop();
        Term term = match[0].term;
        SegmentMergeInfo top = queue.Top();
        while (top != null && term.CompareTo(top.term) == 0)
        {
            match[matchSize++] = queue.Pop();
            top = queue.Top();
        }

        if ((System.Object) currentField != (System.Object) term.Field)
        {
            currentField = term.Field;
            if (termsConsumer != null)
                termsConsumer.Finish();
            FieldInfo fieldInfo = fieldInfos.FieldInfo(currentField);
            termsConsumer = consumer.AddField(fieldInfo);
            omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
        }

        int df = AppendPostings(termsConsumer, match, matchSize); // add new TermInfo

        checkAbort.Work(df / 3.0);

        while (matchSize > 0)
        {
            SegmentMergeInfo smi = match[--matchSize];
            if (smi.Next())
            {
                queue.Add(smi); // restore queue
            }
            else
            {
                smi.Dispose(); // done with a segment
            }
        }
    }
}
private static void FillCache(IndexSearcherHolder.IndexSearcherHoldingState state, IEnumerable<string> fieldsToRead, IndexReader reader)
{
    foreach (var field in fieldsToRead)
    {
        var items = new LinkedList<IndexSearcherHolder.IndexSearcherHoldingState.CacheVal>[reader.MaxDoc];
        using (var termDocs = reader.TermDocs())
        using (var termEnum = reader.Terms(new Term(field)))
        {
            do
            {
                if (termEnum.Term == null || field != termEnum.Term.Field)
                    break;
                Term term = termEnum.Term;
                if (LowPrecisionNumber(term.Field, term.Text))
                    continue;

                var totalDocCountIncludedDeletes = termEnum.DocFreq();
                termDocs.Seek(termEnum.Term);
                while (termDocs.Next() && totalDocCountIncludedDeletes > 0)
                {
                    totalDocCountIncludedDeletes -= 1;
                    if (reader.IsDeleted(termDocs.Doc))
                        continue;

                    if (items[termDocs.Doc] == null)
                        items[termDocs.Doc] = new LinkedList<IndexSearcherHolder.IndexSearcherHoldingState.CacheVal>();
                    items[termDocs.Doc].AddLast(new IndexSearcherHolder.IndexSearcherHoldingState.CacheVal
                    {
                        Term = termEnum.Term
                    });
                }
            }
            while (termEnum.Next());
        }
        state.SetInCache(field, items);
    }
}
public static RavenJObject[] ReadAllEntriesFromIndex(IndexReader reader)
{
    if (reader.MaxDoc > 512 * 1024)
    {
        throw new InvalidOperationException("Refusing to extract all index entries from an index with " + reader.MaxDoc +
                                            " entries, because of the probable time / memory costs associated with that." +
                                            Environment.NewLine +
                                            "Viewing index entries is a debug tool, and should not be used on indexes of this size. You might want to try Luke, instead.");
    }
    var results = new RavenJObject[reader.MaxDoc];
    using (var termDocs = reader.TermDocs())
    using (var termEnum = reader.Terms())
    {
        while (termEnum.Next())
        {
            var term = termEnum.Term;
            if (term == null)
                break;

            var text = term.Text;

            termDocs.Seek(termEnum);
            for (int i = 0; i < termEnum.DocFreq() && termDocs.Next(); i++)
            {
                RavenJObject result = results[termDocs.Doc];
                if (result == null)
                    results[termDocs.Doc] = result = new RavenJObject();
                var propertyName = term.Field;
                if (propertyName.EndsWith("_ConvertToJson") || propertyName.EndsWith("_IsArray"))
                    continue;
                if (result.ContainsKey(propertyName))
                {
                    switch (result[propertyName].Type)
                    {
                        case JTokenType.Array:
                            ((RavenJArray) result[propertyName]).Add(text);
                            break;
                        case JTokenType.String:
                            result[propertyName] = new RavenJArray
                            {
                                result[propertyName],
                                text
                            };
                            break;
                        default:
                            throw new ArgumentException("No idea how to handle " + result[propertyName].Type);
                    }
                }
                else
                {
                    result[propertyName] = text;
                }
            }
        }
    }
    return results;
}
/// <summary>
/// Loads multi-value facet data. This method uses a work area to prepare loading.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <param name="listFactory"></param>
/// <param name="workArea"></param>
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    long t0 = Environment.TickCount;
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    ITermValueList list = (listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet();
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;
    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                    break;

                string val = term.Text;

                if (val != null)
                {
                    list.Add(val);

                    tdoc.Seek(tenum);
                    //freqList.add(tenum.docFreq()); // removed because the df doesn't take into account the num of deletedDocs
                    int df = 0;
                    int minID = -1;
                    int maxID = -1;
                    int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId))
                            LogOverflow(fieldName);
                        minID = docid;
                        bitset.Set(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId))
                                LogOverflow(fieldName);
                            bitset.Set(docid);
                        }
                        maxID = docid;
                    }
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }

                t++;
            }
            while (tenum.Next());
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
    }
    catch (System.IO.IOException e)
    {
        throw e;
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int) bitset.Cardinality();
}
// There are two ways we can determine the max_results
// most recent items:
//
// One is to instantiate Lucene documents for each of
// the document IDs in primary_matches.  This is a
// fairly expensive operation.
//
// The other is to walk through the list of all
// document IDs in descending time order.  This is
// a less expensive operation, but adds up over time
// on large data sets.
//
// We can walk about 2.5 docs for every Document we
// instantiate.  So what we'll do, if we have more
// matches than available hits, is walk (m * 1.25)
// docs to see if we can fill out the top 100 hits.
// If not, we'll fall back to creating documents
// for all of them.

private static ArrayList ScanRecentDocs (IndexReader primary_reader,
                                         IndexReader secondary_reader,
                                         BetterBitArray primary_matches,
                                         Dictionary<int, Hit> hits_by_id,
                                         int max_results,
                                         ref int total_number_of_matches,
                                         HitFilter hit_filter,
                                         string index_name)
{
    Stopwatch a = new Stopwatch ();
    a.Start ();

    TermDocs docs = primary_reader.TermDocs ();
    TermEnum enumerator = primary_reader.Terms (new Term ("InvertedTimestamp", String.Empty));
    ArrayList results = new ArrayList (max_results);
    int docs_found = 0;
    int docs_walked = 0;
    int hit_filter_removed = 0;
    int max_docs = (int) (primary_matches.TrueCount * 1.25);

    Term term;
    TermDocs secondary_term_docs = null;
    if (secondary_reader != null)
        secondary_term_docs = secondary_reader.TermDocs ();

    do {
        term = enumerator.Term ();

        if (term.Field () != "InvertedTimestamp")
            break;

        docs.Seek (enumerator);

        while (docs.Next ()
               && docs_found < max_results
               && docs_walked < max_docs) {

            int doc_id = docs.Doc ();

            if (primary_matches.Get (doc_id)) {
                Document doc = primary_reader.Document (doc_id);
                Hit hit = CreateHit (doc, secondary_reader, secondary_term_docs);

                // If we have a HitFilter, apply it.
                if (hit_filter != null && ! hit_filter (hit)) {
                    if (Debug)
                        Log.Debug ("Filtered out {0}", hit.Uri);
                    hit_filter_removed ++;
                    continue;
                }
                hits_by_id [doc_id] = hit;
                // Add the result, last modified first
                results.Add (hit);
                docs_found++;
            }

            docs_walked++;
        }
    } while (enumerator.Next ()
             && docs_found < max_results
             && docs_walked < max_docs);

    docs.Close ();
    if (secondary_term_docs != null)
        secondary_term_docs.Close ();

    // If we've found all the docs we can return in a subset,
    // fantastic: we've probably short circuited a slow search.
    if (docs_found != max_results) {
        // Otherwise, bad luck!  Not all docs were found.
        // Start afresh - this time traversing all results.
        results = null;
    } else {
        // Adjust total_number_of_matches.  We need to do this to avoid scenarios like the following:
        // max_hits = 100.  Matched 100 results.  But the hit filter removed 30, so 70 results will be returned.
        // We want to avoid saying "Showing top 70 of 100".  Note that since we are not passing
        // every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
        // 1234 could actually be much less.  But since max_hits was 100, that will not mislead the user.
        total_number_of_matches -= hit_filter_removed;
    }

    a.Stop ();
    if (Debug) {
        Log.Debug (">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);
        if (docs_found == max_results)
            Log.Debug (">>> {0}: Successfully short circuited timestamp ordering!", index_name);
    }

    return results;
}
private OpenBitSet FastBits(IndexReader reader)
{
    OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
    bits.Set(0, reader.MaxDoc()); // assume all are valid
    Term startTerm = new Term(fieldName);
    TermEnum te = reader.Terms(startTerm);
    if (te != null)
    {
        Term currTerm = te.Term();
        while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) // term fieldnames are interned
        {
            if (te.DocFreq() > 1)
            {
                int lastDoc = -1;
                // unset potential duplicates
                TermDocs td = reader.TermDocs(currTerm);
                td.Next();
                if (keepMode == KM_USE_FIRST_OCCURRENCE)
                {
                    td.Next();
                }
                do
                {
                    lastDoc = td.Doc();
                    bits.Clear(lastDoc);
                }
                while (td.Next());

                if (keepMode == KM_USE_LAST_OCCURRENCE)
                {
                    // restore the last bit
                    bits.Set(lastDoc);
                }
            }
            if (!te.Next())
            {
                break;
            }
            currTerm = te.Term();
        }
    }
    return bits;
}
public TermsEnumCompatibility(IndexReader reader, String fieldName)
{
    this.reader = reader;
    this.fieldName = string.Intern(fieldName);
    this.termEnum = reader.Terms(new Term(this.fieldName));
}
public static void AssertIndexEquals(IndexReader index1, IndexReader index2)
{
    Assert.AreEqual(index1.NumDocs(), index2.NumDocs(), "IndexReaders have different values for numDocs.");
    Assert.AreEqual(index1.MaxDoc, index2.MaxDoc, "IndexReaders have different values for maxDoc.");
    Assert.AreEqual(index1.HasDeletions, index2.HasDeletions, "Only one IndexReader has deletions.");
    Assert.AreEqual(index1.IsOptimized(), index2.IsOptimized(), "Only one index is optimized.");

    // check field names
    System.Collections.Generic.ICollection<string> fieldsNames1 = index1.GetFieldNames(FieldOption.ALL);
    System.Collections.Generic.ICollection<string> fieldsNames2 = index2.GetFieldNames(FieldOption.ALL);

    System.Collections.Generic.ICollection<IFieldable> fields1 = null;
    System.Collections.Generic.ICollection<IFieldable> fields2 = null;

    Assert.AreEqual(fieldsNames1.Count, fieldsNames2.Count, "IndexReaders have different numbers of fields.");
    System.Collections.IEnumerator it1 = fieldsNames1.GetEnumerator();
    System.Collections.IEnumerator it2 = fieldsNames2.GetEnumerator();
    while (it1.MoveNext() && it2.MoveNext())
    {
        Assert.AreEqual((System.String) it1.Current, (System.String) it2.Current, "Different field names.");
    }

    // check norms
    it1 = fieldsNames1.GetEnumerator();
    while (it1.MoveNext())
    {
        System.String curField = (System.String) it1.Current;
        byte[] norms1 = index1.Norms(curField);
        byte[] norms2 = index2.Norms(curField);
        if (norms1 != null && norms2 != null)
        {
            Assert.AreEqual(norms1.Length, norms2.Length);
            for (int i = 0; i < norms1.Length; i++)
            {
                Assert.AreEqual(norms1[i], norms2[i], "Norm different for doc " + i + " and field '" + curField + "'.");
            }
        }
        else
        {
            Assert.AreSame(norms1, norms2);
        }
    }

    // check deletions
    for (int i = 0; i < index1.MaxDoc; i++)
    {
        Assert.AreEqual(index1.IsDeleted(i), index2.IsDeleted(i), "Doc " + i + " only deleted in one index.");
    }

    // check stored fields
    for (int i = 0; i < index1.MaxDoc; i++)
    {
        if (!index1.IsDeleted(i))
        {
            Document doc1 = index1.Document(i);
            Document doc2 = index2.Document(i);
            fields1 = doc1.GetFields();
            fields2 = doc2.GetFields();
            Assert.AreEqual(fields1.Count, fields2.Count, "Different numbers of fields for doc " + i + ".");
            it1 = fields1.GetEnumerator();
            it2 = fields2.GetEnumerator();
            while (it1.MoveNext() && it2.MoveNext())
            {
                Field curField1 = (Field) it1.Current;
                Field curField2 = (Field) it2.Current;
                Assert.AreEqual(curField1.Name, curField2.Name, "Different field names for doc " + i + ".");
                Assert.AreEqual(curField1.StringValue, curField2.StringValue, "Different field values for doc " + i + ".");
            }
        }
    }

    // check dictionary and posting lists
    TermEnum enum1 = index1.Terms();
    TermEnum enum2 = index2.Terms();
    TermPositions tp1 = index1.TermPositions();
    TermPositions tp2 = index2.TermPositions();
    while (enum1.Next())
    {
        Assert.IsTrue(enum2.Next());
        Assert.AreEqual(enum1.Term, enum2.Term, "Different term in dictionary.");
        tp1.Seek(enum1.Term);
        tp2.Seek(enum1.Term);
        while (tp1.Next())
        {
            Assert.IsTrue(tp2.Next());
            Assert.AreEqual(tp1.Doc, tp2.Doc, "Different doc id in postinglist of term " + enum1.Term + ".");
            Assert.AreEqual(tp1.Freq, tp2.Freq, "Different term frequency in postinglist of term " + enum1.Term + ".");
            for (int i = 0; i < tp1.Freq; i++)
            {
                Assert.AreEqual(tp1.NextPosition(), tp2.NextPosition(), "Different positions in postinglist of term " + enum1.Term + ".");
            }
        }
    }
}
public override TermEnum Terms()
{
    EnsureOpen();
    return in_Renamed.Terms();
}
/*
 * Automatically adds stop words for the given field with terms exceeding maxDocFreq
 *
 * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
 *                   exceed the required document frequency
 * @param fieldName  The field for which stopwords will be added
 * @param maxDocFreq The maximum number of index documents which
 *                   can contain a term, after which the term is considered to be a stop word
 * @return The number of stop words identified.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
{
    var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
    String internedFieldName = StringHelper.Intern(fieldName);
    TermEnum te = reader.Terms(new Term(fieldName));
    Term term = te.Term;
    while (term != null)
    {
        if (term.Field != internedFieldName)
        {
            break;
        }
        if (te.DocFreq() > maxDocFreq)
        {
            stopWords.Add(term.Text);
        }
        if (!te.Next())
        {
            break;
        }
        term = te.Term;
    }
    stopWordsPerField.Add(fieldName, stopWords);

    /* if the stopwords for a field are changed,
     * then saved streams for that field are erased.
     */
    IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>) PreviousTokenStream;
    if (streamMap != null)
        streamMap.Remove(fieldName);

    return stopWords.Count;
}
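// A minimal usage sketch for AddStopWords above. It assumes the method lives on
// the contrib QueryAutoStopWordAnalyzer (where it comes from in Java Lucene);
// the already-open "reader", the field name, and the threshold of 100 documents
// are illustrative values, not part of the original source.
public static int MarkFrequentTermsAsStopWords(IndexReader reader)
{
    var analyzer = new Lucene.Net.Analysis.Query.QueryAutoStopWordAnalyzer(
        Lucene.Net.Util.Version.LUCENE_29,
        new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
    // terms of "body" appearing in more than 100 documents become stop words
    // that the analyzer silently drops at query-analysis time
    return analyzer.AddStopWords(reader, "body", 100);
}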
public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    long t0 = System.Environment.TickCount;
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
    BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;

    string pre = null;

    int df = 0;
    int minID = -1;
    int maxID = -1;
    int valId = 0;

    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                    break;

                string val = term.Text;

                if (val != null)
                {
                    int weight = 0;
                    string[] split = val.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
                    if (split.Length > 1)
                    {
                        val = split[0];
                        weight = int.Parse(split[split.Length - 1]);
                    }
                    if (pre == null || !val.Equals(pre))
                    {
                        if (pre != null)
                        {
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }

                        list.Add(val);

                        df = 0;
                        minID = -1;
                        maxID = -1;
                        valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                        t++;
                    }

                    tdoc.Seek(tenum);
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;

                        if (!loader.Add(docid, valId))
                            LogOverflow(fieldName);
                        else
                            weightLoader.Add(docid, weight);

                        if (docid < minID)
                            minID = docid;
                        bitset.FastSet(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;

                            if (!loader.Add(docid, valId))
                                LogOverflow(fieldName);
                            else
                                weightLoader.Add(docid, weight);

                            bitset.FastSet(docid);
                        }
                        if (docid > maxID)
                            maxID = docid;
                    }
                    pre = val;
                }
            }
            while (tenum.Next());
            if (pre != null)
            {
                freqList.Add(df);
                minIDList.Add(minID);
                maxIDList.Add(maxID);
            }
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
        _weightArray.Load(maxdoc + 1, weightLoader);
    }
    catch (System.IO.IOException e)
    {
        throw e;
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int) bitset.Cardinality();
}
private static void FillCache(IndexSearcherHolder.IndexSearcherHoldingState state, List<string> fieldsToRead, IndexReader reader)
{
    foreach (var field in fieldsToRead)
    {
        using (var termDocs = reader.TermDocs())
        using (var termEnum = reader.Terms(new Term(field)))
        {
            do
            {
                if (termEnum.Term == null || field != termEnum.Term.Field)
                    break;

                if (LowPrecisionNumber(termEnum.Term))
                    continue;

                var totalDocCountIncludedDeletes = termEnum.DocFreq();
                termDocs.Seek(termEnum.Term);
                while (termDocs.Next() && totalDocCountIncludedDeletes > 0)
                {
                    totalDocCountIncludedDeletes -= 1;
                    if (reader.IsDeleted(termDocs.Doc))
                        continue;

                    state.SetInCache(field, termDocs.Doc, termEnum.Term);
                }
            }
            while (termEnum.Next());
        }
    }
}
public virtual void TestPhrasePrefix()
{
    RAMDirectory indexStore = new RAMDirectory();
    IndexWriter writer = new IndexWriter(indexStore, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED, null);
    Add("blueberry pie", writer);
    Add("blueberry strudel", writer);
    Add("blueberry pizza", writer);
    Add("blueberry chewing gum", writer);
    Add("bluebird pizza", writer);
    Add("bluebird foobar pizza", writer);
    Add("piccadilly circus", writer);
    writer.Optimize(null);
    writer.Close();

    IndexSearcher searcher = new IndexSearcher(indexStore, true, null);

    // search for "blueberry pi*":
    MultiPhraseQuery query1 = new MultiPhraseQuery();
    // search for "strawberry pi*":
    MultiPhraseQuery query2 = new MultiPhraseQuery();
    query1.Add(new Term("body", "blueberry"));
    query2.Add(new Term("body", "strawberry"));

    System.Collections.ArrayList termsWithPrefix = new System.Collections.ArrayList();
    IndexReader ir = IndexReader.Open((Directory) indexStore, true, null);

    // this TermEnum gives "piccadilly", "pie" and "pizza".
    System.String prefix = "pi";
    TermEnum te = ir.Terms(new Term("body", prefix), null);
    do
    {
        if (te.Term.Text.StartsWith(prefix))
        {
            termsWithPrefix.Add(te.Term);
        }
    }
    while (te.Next(null));

    query1.Add((Term[]) termsWithPrefix.ToArray(typeof(Term)));
    Assert.AreEqual("body:\"blueberry (piccadilly pie pizza)\"", query1.ToString());
    query2.Add((Term[]) termsWithPrefix.ToArray(typeof(Term)));
    Assert.AreEqual("body:\"strawberry (piccadilly pie pizza)\"", query2.ToString());

    ScoreDoc[] result;
    result = searcher.Search(query1, null, 1000, null).ScoreDocs;
    Assert.AreEqual(2, result.Length);
    result = searcher.Search(query2, null, 1000, null).ScoreDocs;
    Assert.AreEqual(0, result.Length);

    // search for "blue* pizza":
    MultiPhraseQuery query3 = new MultiPhraseQuery();
    termsWithPrefix.Clear();
    prefix = "blue";
    te = ir.Terms(new Term("body", prefix), null);
    do
    {
        if (te.Term.Text.StartsWith(prefix))
        {
            termsWithPrefix.Add(te.Term);
        }
    }
    while (te.Next(null));
    query3.Add((Term[]) termsWithPrefix.ToArray(typeof(Term)));
    query3.Add(new Term("body", "pizza"));

    result = searcher.Search(query3, null, 1000, null).ScoreDocs;
    Assert.AreEqual(2, result.Length); // blueberry pizza, bluebird pizza
    Assert.AreEqual("body:\"(blueberry bluebird) pizza\"", query3.ToString());

    // test slop:
    query3.Slop = 1;
    result = searcher.Search(query3, null, 1000, null).ScoreDocs;
    Assert.AreEqual(3, result.Length); // blueberry pizza, bluebird pizza, bluebird foobar pizza

    MultiPhraseQuery query4 = new MultiPhraseQuery();
    // okay, all terms must belong to the same field
    Assert.Throws<ArgumentException>(() =>
    {
        query4.Add(new Term("field1", "foo"));
        query4.Add(new Term("field2", "foobar"));
    });

    searcher.Close();
    indexStore.Close();
}
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
{
    string field = string.Intern(fieldName);
    int maxDoc = reader.MaxDoc;

    if (orderArray == null) // we want to reuse the memory
    {
        orderArray = NewInstance(termCountSize, maxDoc);
    }
    else
    {
        orderArray.EnsureCapacity(maxDoc); // no need to fill to 0, we are resetting the data anyway
    }

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int length = maxDoc + 1;
    ITermValueList list = listFactory == null ? new TermStringList() : listFactory.CreateTermList();
    TermDocs termDocs = reader.TermDocs();
    TermEnum termEnum = reader.Terms(new Term(field));
    int t = 0; // current term number

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    //int df = 0;
    t++;
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || string.CompareOrdinal(term.Field, field) != 0)
                break;

            if (t >= orderArray.MaxValue())
            {
                throw new System.IO.IOException("maximum number of values cannot exceed: " + orderArray.MaxValue());
            }
            // Alexey: well, we could now get more than one term per document. Effectively, we could build a facet against a tokenized field.
            /*// we expect that there is at most one term per document
            if (t >= length)
            {
                throw new RuntimeException("there are more terms than documents in field \"" + field
                    + "\", but it's impossible to sort on tokenized fields");
            }*/
            // store term text
            list.Add(term.Text);
            termDocs.Seek(termEnum);
            // freqList.add(termEnum.docFreq()); // doesn't take into account deldocs
            int minID = -1;
            int maxID = -1;
            int df = 0;
            if (termDocs.Next())
            {
                df++;
                int docid = termDocs.Doc;
                orderArray.Add(docid, t);
                minID = docid;
                while (termDocs.Next())
                {
                    df++;
                    docid = termDocs.Doc;
                    orderArray.Add(docid, t);
                }
                maxID = docid;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
        while (termEnum.Next());
    }
    finally
    {
        termDocs.Dispose();
        termEnum.Dispose();
    }
    list.Seal();

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();
}
public override TermEnum Terms()
{
    return in_Renamed.Terms();
}