/// <summary>
/// Asks the underlying term-docs cursor to skip to <paramref name="target"/> and records the resulting position.
/// </summary>
/// <param name="target">Document id handed to <c>SkipTo</c>.</param>
/// <returns>
/// The cursor's current document when <c>SkipTo</c> succeeds; otherwise
/// <c>DocIdSetIterator.NO_MORE_DOCS</c>.
/// </returns>
public override int Advance(int target)
{
    if (!_td.SkipTo(target))
    {
        // SkipTo reported failure: dispose the cursor and mark the iterator as finished.
        _td.Dispose();
        _doc = DocIdSetIterator.NO_MORE_DOCS;
        return _doc;
    }

    _doc = _td.Doc;
    return _doc;
}
/// <summary>
/// Loads multi-value facet data. This method uses a work area to prepare loading.
/// </summary>
/// <remarks>
/// Entry 0 of the value list, frequency, min-id and max-id arrays is a placeholder (null value) that is
/// filled in after all terms have been read.
/// </remarks>
/// <param name="fieldName">Name of the field whose terms are enumerated.</param>
/// <param name="reader">Index reader that supplies the term enumeration and the postings.</param>
/// <param name="listFactory">Factory for the term value list; when null a <c>TermStringList</c> is created.</param>
/// <param name="workArea">Work area passed to <c>GetBufferedLoader</c>.</param>
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet();
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number

    // Placeholder entry at index 0; its statistics are computed at the end of the method.
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;
    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                {
                    // Ran past the last term of this field.
                    break;
                }

                string val = term.Text;
                if (val != null)
                {
                    list.Add(val);

                    tdoc.Seek(tenum);
                    // The document frequency is counted by hand because the term's docFreq
                    // doesn't take into account the number of deleted docs.
                    int df = 0;
                    int minID = -1;
                    int maxID = -1;
                    // The first negativeValueCount terms receive their ids in reverse order.
                    int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        minID = docid;
                        bitset.Set(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            bitset.Set(docid);
                        }
                        maxID = docid;
                    }
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }

                t++;
            }
            while (tenum.Next());
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
    }
    catch (System.IO.IOException)
    {
        // Let I/O failures propagate unwrapped; "throw;" keeps the original stack trace.
        throw;
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    // Fill in the min/max document ids for entry 0 by scanning the nested array from both ends.
    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }

    // Entry 0 frequency: doc ids in [0, maxdoc] that were not seen in any posting list.
    this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
}
/// <summary>
/// Loads single-value facet data for a field: the term value list, per-term frequency and
/// min/max document ids, and the per-document order array.
/// </summary>
/// <remarks>
/// Entry 0 of the value list and of the statistics arrays is a placeholder (null value) that is
/// filled in after all terms have been read. An existing order array is reused when present.
/// </remarks>
/// <param name="fieldName">Name of the field whose terms are enumerated.</param>
/// <param name="reader">Index reader that supplies the term enumeration and the postings.</param>
/// <param name="listFactory">Factory for the term value list; when null a <c>TermStringList</c> is created.</param>
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
{
    string field = string.Intern(fieldName);
    int maxDoc = reader.MaxDoc;

    BigSegmentedArray order = this.orderArray;
    if (order == null) // we want to reuse the memory
    {
        int dictValueCount = GetDictValueCount(reader, fieldName);
        order = NewInstance(dictValueCount, maxDoc);
    }
    else
    {
        // No need to fill with 0, we are resetting the data anyway.
        order.EnsureCapacity(maxDoc);
    }
    this.orderArray = order;

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    ITermValueList list = listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList();
    int negativeValueCount = GetNegativeValueCount(reader, field);

    int t = 0; // current term number

    // Placeholder entry at index 0; its statistics are computed at the end of the method.
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    int totalFreq = 0;
    t++;

    // Both cursors are acquired inside the try so that a failure while opening the second one,
    // or while disposing the first one, can no longer leak the other.
    TermDocs termDocs = null;
    TermEnum termEnum = null;
    try
    {
        termDocs = reader.TermDocs();
        termEnum = reader.Terms(new Term(field, ""));
        do
        {
            Term term = termEnum.Term;
            if (term == null || string.CompareOrdinal(term.Field, field) != 0)
            {
                // Ran past the last term of this field.
                break;
            }

            // Store the term text. A document may carry more than one term (for example when the
            // facet is built against a tokenized field), so the term count is not checked
            // against the document count.
            list.Add(term.Text);
            termDocs.Seek(termEnum);

            // The document frequency is counted by hand because docFreq doesn't take deleted docs into account.
            int minID = -1;
            int maxID = -1;
            int df = 0;
            // The first negativeValueCount terms receive their ids in reverse order.
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            if (termDocs.Next())
            {
                df++;
                int docid = termDocs.Doc;
                order.Add(docid, valId);
                minID = docid;
                while (termDocs.Next())
                {
                    df++;
                    docid = termDocs.Doc;
                    order.Add(docid, valId);
                }
                maxID = docid;
            }
            freqList.Add(df);
            totalFreq += df;
            minIDList.Add(minID);
            maxIDList.Add(maxID);

            t++;
        }
        while (termEnum.Next());
    }
    finally
    {
        try
        {
            if (termDocs != null)
            {
                termDocs.Dispose();
            }
        }
        finally
        {
            if (termEnum != null)
            {
                termEnum.Dispose();
            }
        }
    }

    list.Seal();

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    // Fill in the min/max document ids for entry 0 by scanning the order array from both ends.
    int doc = 0;
    while (doc <= maxDoc && order.Get(doc) != 0)
    {
        ++doc;
    }
    if (doc <= maxDoc)
    {
        this.minIDs[0] = doc;
        // Try to get the max
        doc = maxDoc;
        while (doc > 0 && order.Get(doc) != 0)
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }

    // Entry 0 frequency: whatever is left of maxDoc + 1 after subtracting all counted postings.
    this.freqs[0] = maxDoc + 1 - totalFreq;
}
/// <summary>
/// Loads multi-value facet data together with an integer weight per (document, value) pair.
/// </summary>
/// <remarks>
/// Term text is split on the '\0' character. When the split yields more than one part, the first part
/// is the facet value and the last part is parsed as the weight; otherwise the whole text is the value
/// and the weight is 0. Consecutive terms that produce the same value share a single entry whose
/// frequency and min/max document ids accumulate across those terms.
/// Entry 0 of the value list and of the statistics arrays is a placeholder (null value) that is
/// filled in after all terms have been read.
/// </remarks>
/// <param name="fieldName">Name of the field whose terms are enumerated.</param>
/// <param name="reader">Index reader that supplies the term enumeration and the postings.</param>
/// <param name="listFactory">Factory for the term value list; when null a <c>TermStringList</c> is created.</param>
/// <param name="workArea">Work area passed to <c>GetBufferedLoader</c> for the value loader.</param>
public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
    BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number

    // Placeholder entry at index 0; its statistics are computed at the end of the method.
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;

    // Statistics of the value currently being accumulated; flushed when the value changes.
    string pre = null;
    int df = 0;
    int minID = -1;
    int maxID = -1;
    int valId = 0;

    // Loop-invariant separator, allocated once instead of once per term.
    char[] separator = new char[] { '\0' };

    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                {
                    // Ran past the last term of this field.
                    break;
                }

                string val = term.Text;
                if (val != null)
                {
                    int weight = 0;
                    string[] split = val.Split(separator, StringSplitOptions.RemoveEmptyEntries);
                    if (split.Length > 1)
                    {
                        val = split[0];
                        weight = int.Parse(split[split.Length - 1]);
                    }

                    if (pre == null || !val.Equals(pre))
                    {
                        if (pre != null)
                        {
                            // Flush the statistics of the previous value.
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }

                        list.Add(val);

                        df = 0;
                        minID = -1;
                        maxID = -1;
                        // The first negativeValueCount values receive their ids in reverse order.
                        valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                        t++;
                    }

                    tdoc.Seek(tenum);
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        else
                        {
                            weightLoader.Add(docid, weight);
                        }

                        // minID starts at -1 ("unset"), so a plain "docid < minID" test can never
                        // succeed; treat a negative minID as unset and otherwise keep the smallest id.
                        if (minID < 0 || docid < minID)
                        {
                            minID = docid;
                        }
                        bitset.FastSet(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            else
                            {
                                weightLoader.Add(docid, weight);
                            }
                            bitset.FastSet(docid);
                        }
                        if (docid > maxID)
                        {
                            maxID = docid;
                        }
                    }
                    pre = val;
                }
            }
            while (tenum.Next());

            if (pre != null)
            {
                // Flush the statistics of the last value.
                freqList.Add(df);
                minIDList.Add(minID);
                maxIDList.Add(maxID);
            }
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
        _weightArray.Load(maxdoc + 1, weightLoader);
    }
    catch (System.IO.IOException)
    {
        // Let I/O failures propagate unwrapped; "throw;" keeps the original stack trace.
        throw;
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    // Fill in the min/max document ids for entry 0 by scanning the nested array from both ends.
    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }

    // Entry 0 frequency: doc ids in [0, maxdoc] that were not seen in any posting list.
    this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
}
/// <summary>
/// Builds a <c>FacetDataCache</c> in which each document's order entry is a bitmask:
/// the term numbered <c>t</c> (starting at 1) contributes bit <c>t - 1</c> to every document it occurs in.
/// </summary>
/// <param name="reader">Index reader that supplies the term enumeration and the postings.</param>
/// <returns>A cache holding the bitmask array, the sealed term list and per-term frequency / min id / max id arrays.</returns>
/// <exception cref="IOException">Thrown when the term number exceeds <c>MAX_VAL_COUNT</c>.</exception>
public override FacetDataCache Load(BoboIndexReader reader)
{
    int maxDoc = reader.MaxDoc;
    BigIntArray order = new BigIntArray(maxDoc);

    ITermValueList mterms;
    if (_termListFactory == null)
    {
        mterms = new TermStringList();
    }
    else
    {
        mterms = _termListFactory.CreateTermList();
    }

    List<int> firstDocs = new List<int>();
    List<int> lastDocs = new List<int>();
    List<int> docCounts = new List<int>();

    // Index 0 is a placeholder entry with a null value.
    mterms.Add(null);
    firstDocs.Add(-1);
    lastDocs.Add(-1);
    docCounts.Add(0);
    int termNumber = 1; // number of the term about to be read

    TermDocs postings = null;
    TermEnum terms = null;
    try
    {
        postings = reader.TermDocs();
        terms = reader.Terms(new Term(_indexFieldName, ""));

        bool hasTerm = terms != null;
        while (hasTerm)
        {
            Term current = terms.Term;
            if (current == null || !_indexFieldName.Equals(current.Field))
            {
                // Ran past the last term of this field.
                break;
            }

            if (termNumber > MAX_VAL_COUNT)
            {
                throw new IOException("maximum number of value cannot exceed: " + MAX_VAL_COUNT);
            }

            mterms.Add(current.Text);
            int mask = 1 << (termNumber - 1);

            postings.Seek(terms);

            // Counted by hand because docFreq doesn't take into account the number of deleted docs.
            int count = 0;
            int first = -1;
            int last = -1;
            while (postings.Next())
            {
                int docid = postings.Doc;
                order.Add(docid, order.Get(docid) | mask);
                if (count == 0)
                {
                    first = docid;
                }
                last = docid;
                count++;
            }

            docCounts.Add(count);
            firstDocs.Add(first);
            lastDocs.Add(last);

            termNumber++;
            hasTerm = terms.Next();
        }
    }
    finally
    {
        try
        {
            if (postings != null)
            {
                postings.Dispose();
            }
        }
        finally
        {
            if (terms != null)
            {
                terms.Dispose();
            }
        }
    }

    mterms.Seal();

    return new FacetDataCache(order, mterms, docCounts.ToArray(), firstDocs.ToArray(), lastDocs.ToArray(), TermCountSize.Large);
}
/// <summary>
/// Builds a <c>FacetDataCache</c> from values obtained through <c>_facetDataFetcher</c> for every document
/// returned by <c>reader.TermDocs(null)</c>. Documents are grouped by fetched value in a sorted dictionary;
/// documents whose fetched value is null are accounted for in entry 0.
/// </summary>
/// <param name="reader">Reader that supplies the documents and is passed to the fetcher.</param>
/// <returns>A cache holding the order array, the sealed term list and per-value frequency / min id / max id arrays.</returns>
public override FacetDataCache Load(BoboIndexReader reader)
{
    int doc = -1;
    C5.TreeDictionary<object, List<int>> dataMap = null;
    List<int> docList = null;

    // Statistics for documents whose fetched value is null.
    int nullMinId = -1;
    int nullMaxId = -1;
    int nullFreq = 0;

    TermDocs termDocs = reader.TermDocs(null);
    try
    {
        while (termDocs.Next())
        {
            doc = termDocs.Doc;
            object val = _facetDataFetcher.Fetch(reader, doc);

            if (val != null)
            {
                if (dataMap == null)
                {
                    // First non-null value: its runtime type selects the key comparer.
                    long[] longKey = val as long[];
                    if (longKey != null)
                    {
                        if (_termListFactory == null)
                        {
                            _termListFactory = new TermFixedLengthLongArrayListFactory(longKey.Length);
                        }
                        dataMap = new C5.TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerLongArrayComparator());
                    }
                    else if (val is IComparable)
                    {
                        // NOTE: In .NET 3.5, the default constructor doesn't work in this case. A custom comparer
                        // converts the objects to IComparable before comparing them, falling back to a string
                        // comparison if they don't convert. This differs from the Java implementation, which
                        // uses the default constructor.
                        dataMap = new C5.TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerComparableComparator());
                    }
                    else
                    {
                        dataMap = new C5.TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerObjectComparator());
                    }
                }

                // Look up the document list for this value, creating it on first use.
                docList = dataMap.Contains(val) ? dataMap[val] : null;
                if (docList == null)
                {
                    docList = new List<int>();
                    dataMap[val] = docList;
                }
                docList.Add(doc);
            }
            else
            {
                if (nullMinId < 0)
                {
                    nullMinId = doc;
                }
                nullMaxId = doc;
                ++nullFreq;
            }
        }
    }
    finally
    {
        termDocs.Dispose();
    }
    _facetDataFetcher.Cleanup(reader);

    int maxDoc = reader.MaxDoc;
    int size = 1;
    if (dataMap != null)
    {
        size = dataMap.Count + 1;
    }

    BigSegmentedArray order = new BigIntArray(maxDoc);
    ITermValueList list;
    if (_termListFactory == null)
    {
        list = new TermStringList(size);
    }
    else
    {
        list = _termListFactory.CreateTermList(size);
    }

    int[] freqs = new int[size];
    int[] minIDs = new int[size];
    int[] maxIDs = new int[size];

    // Entry 0 carries the statistics of the null-valued documents.
    list.Add(null);
    freqs[0] = nullFreq;
    minIDs[0] = nullMinId;
    maxIDs[0] = nullMaxId;

    if (dataMap != null)
    {
        int slot = 1;
        foreach (var entry in dataMap)
        {
            list.Add(list.Format(entry.Key));
            docList = entry.Value;
            freqs[slot] = docList.Count;
            minIDs[slot] = docList.Get(0, int.MinValue);

            // Take documents from the list until Poll hands back the int.MinValue default.
            for (;;)
            {
                int? next = docList.Poll(int.MinValue);
                if (next == int.MinValue)
                {
                    break;
                }
                doc = (int)next;
                order.Add(doc, slot);
            }

            // doc now holds the last document taken for this value.
            maxIDs[slot] = doc;
            ++slot;
        }
    }
    list.Seal();

    return new FacetDataCache(order, list, freqs, minIDs, maxIDs, TermCountSize.Large);
}