public FacetDataCache()
{
    this.orderArray = null;
    this.valArray = null;
    this.maxIDs = null;
    this.minIDs = null;
    this.freqs = null;
}
public FacetDataCache()
{
    this.m_orderArray = null;
    this.m_valArray = null;
    this.m_maxIDs = null;
    this.m_minIDs = null;
    this.m_freqs = null;
}
public FacetDataCache(BigSegmentedArray orderArray, ITermValueList valArray, int[] freqs,
    int[] minIDs, int[] maxIDs, FacetHandler.TermCountSize termCountSize)
{
    this.orderArray = orderArray;
    this.valArray = valArray;
    this.freqs = freqs;
    this.minIDs = minIDs;
    this.maxIDs = maxIDs;
    this.termCountSize = termCountSize;
}
public FacetDataCache()
{
    this.orderArray = null;
    this.valArray = null;
    this.maxIDs = null;
    this.minIDs = null;
    this.freqs = null;
    this.termCountSize = FacetHandler.TermCountSize.Large;
}
public FacetDataCache(BigSegmentedArray orderArray, ITermValueList valArray, int[] freqs,
    int[] minIDs, int[] maxIDs, TermCountSize termCountSize)
{
    this.orderArray = orderArray;
    this.valArray = valArray;
    this.freqs = freqs;
    this.minIDs = minIDs;
    this.maxIDs = maxIDs;
    // Note: unlike the overload above, this overload accepts termCountSize
    // but does not store it anywhere.
}
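// Illustrative sketch (not from the library source): how the six-argument
// constructor above is typically fed. Slot 0 of valArray/freqs/minIDs/maxIDs
// is reserved for documents with no value, so all arrays are sized
// termCount + 1. The document count, terms, and array values here are made
// up for the example.
public FacetDataCache BuildExampleCache()
{
    var order = new BigIntArray(4);          // one slot per document (4 docs)
    var terms = new TermStringList();
    terms.Add(null);                         // slot 0: the "missing value" sentinel
    terms.Add("blue");                       // term id 1
    terms.Add("red");                        // term id 2
    terms.Seal();

    order.Add(0, 1);                         // doc 0 -> "blue"
    order.Add(1, 2);                         // doc 1 -> "red"
    order.Add(2, 1);                         // doc 2 -> "blue"
    order.Add(3, 0);                         // doc 3 -> no value

    int[] freqs  = { 1, 2, 1 };              // [missing, "blue", "red"]
    int[] minIDs = { 3, 0, 1 };
    int[] maxIDs = { 3, 2, 1 };
    return new FacetDataCache(order, terms, freqs, minIDs, maxIDs, TermCountSize.Large);
}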
public HistogramCollector(string facetName, IFacetCountCollector baseCollector,
    FacetDataCache dataCache, FacetSpec ospec, T start, T end, T unit)
{
    m_facetName = facetName;
    m_baseCollector = baseCollector;
    m_valArray = dataCache.ValArray;
    m_ospec = ospec;
    m_isAggregated = false;
    m_start = start;
    m_end = end;
    m_unit = unit;
    m_count = new LazyBigInt32Array(CountArraySize());
}
public DefaultFacetIterator(ITermValueList valList, BigSegmentedArray counts, int countlength, bool zeroBased)
{
    _valList = valList;
    _count = counts;
    _countlength = countlength;
    _index = -1;
    _lastIndex = _countlength - 1;
    if (!zeroBased)
    {
        _index++;
    }
    facet = null;
    count = 0;
}
public DefaultFacetIterator(ITermValueList valList, BigSegmentedArray countarray, int countlength, bool zeroBased)
{
    m_valList = valList;
    m_count = countarray;
    m_countlength = countlength;
    m_index = -1;
    m_countLengthMinusOne = m_countlength - 1;
    if (!zeroBased)
    {
        m_index++;
    }
    m_facet = null;
    base.m_count = 0;
}
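// Illustrative usage sketch for the iterators above (not from the library
// source). When zeroBased is false, the index starts at 0 instead of -1, so
// the first advance lands on index 1 and slot 0 -- the missing-value bucket --
// is skipped. The Next()/Count contract assumed here (Next() returns the next
// facet value, or null when exhausted) is inferred from the constructors and
// may differ from the actual base class.
var iterator = new DefaultFacetIterator(valList, counts, countlength, false);
string facet;
while ((facet = iterator.Next()) != null)
{
    Console.WriteLine("{0} ({1})", facet, iterator.Count);
}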
public override RandomAccessDocIdSet GetRandomAccessDocIdSet(BoboIndexReader reader)
{
    RandomAccessDocIdSet innerDocSet = _facetFilter.GetRandomAccessDocIdSet(reader);
    if (innerDocSet == EmptyDocIdSet.Instance)
    {
        return innerDocSet;
    }

    FacetDataCache dataCache = _facetDataCacheBuilder.Build(reader);
    int totalCount = reader.MaxDoc;
    ITermValueList valArray = dataCache.ValArray;
    int freqCount = 0;

    var validVals = new List<string>(_valSet.Count());
    foreach (string val in _valSet)
    {
        int idx = valArray.IndexOf(val);
        if (idx >= 0)
        {
            validVals.Add(valArray.Get(idx)); // get and format the value
            freqCount += dataCache.Freqs[idx];
        }
    }

    if (validVals.Count == 0)
    {
        return EmptyDocIdSet.Instance;
    }

    // takeComplement is only used to choose between TermListRandomAccessDocIdSet and innerDocSet
    int validFreqCount = _takeComplement ? (totalCount - freqCount) : freqCount;

    if (_facetDataCacheBuilder.IndexFieldName != null && ((validFreqCount << 1) < totalCount))
    {
        return new TermListRandomAccessDocIdSet(_facetDataCacheBuilder.IndexFieldName, innerDocSet, validVals, reader);
    }
    else
    {
        return innerDocSet;
    }
}
private BigSegmentedArray GetCollapsedCounts()
{
    if (m_collapsedCounts == null)
    {
        m_collapsedCounts = new LazyBigInt32Array(m_bucketValues.Count);
        FacetDataCache dataCache = m_subCollector.DataCache;
        ITermValueList subList = dataCache.ValArray;
        BigSegmentedArray subcounts = m_subCollector.Count;
        FixedBitSet indexSet = new FixedBitSet(subcounts.Length);
        int c = 0;
        int i = 0;
        foreach (string val in m_bucketValues)
        {
            if (val.Length > 0)
            {
                string[] subVals = m_predefinedBuckets.Get(val);
                int count = 0;
                foreach (string subVal in subVals)
                {
                    int index = subList.IndexOf(subVal);
                    if (index > 0)
                    {
                        int subcount = subcounts.Get(index);
                        count += subcount;
                        if (!indexSet.Get(index))
                        {
                            indexSet.Set(index);
                            c += dataCache.Freqs[index];
                        }
                    }
                }
                m_collapsedCounts.Add(i, count);
            }
            i++;
        }
        m_collapsedCounts.Add(0, (m_numdocs - c));
    }
    return m_collapsedCounts;
}
/// <summary>
/// Translates the int value using the val list.
/// </summary>
/// <param name="id"></param>
/// <param name="valarray"></param>
/// <returns></returns>
public object[] GetRawData(int id, ITermValueList valarray)
{
    // NOTE: Added Get() extension method call because the default .NET behavior
    // throws an exception if the index is out of bounds, rather than returning null.
    int[] page = m_list.Get(id >> PAGEID_SHIFT);
    if (page == null)
    {
        return EMPTY;
    }
    else
    {
        int val = page[id & SLOTID_MASK];
        if (val >= 0)
        {
            return new object[] { valarray.GetRawValue(val) };
        }
        else if (val == MISSING)
        {
            return EMPTY;
        }
        else
        {
            int num = (val & COUNT_MASK);
            val >>= VALIDX_SHIFT; // signed shift; remember this is a negative number
            object[] ret = new object[num];
            for (int i = 0; i < num; i++)
            {
                ret[i] = valarray.GetRawValue(page[i - val]);
            }
            return ret;
        }
    }
}
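// A minimal sketch of the packed-slot decoding that GetRawData performs, with
// assumed constants (the real values live in the containing class). Each page
// slot holds either a single non-negative term id, the MISSING sentinel, or a
// negative value packing a count (low bits) together with a negated offset
// (high bits) to where the term ids are stored in the same page.
const int PAGEID_SHIFT = 10;                      // assumed: 1024 slots per page
const int SLOTID_MASK = (1 << PAGEID_SHIFT) - 1;
const int VALIDX_SHIFT = 11;                      // assumed: 11 bits for the count
const int COUNT_MASK = (1 << VALIDX_SHIFT) - 1;
const int MISSING = int.MinValue;                 // assumed sentinel value

static int[] DecodeSlot(int[] page, int id)
{
    int val = page[id & SLOTID_MASK];
    if (val >= 0)
        return new int[] { val };                 // single value: the term id itself
    if (val == MISSING)
        return new int[0];                        // no value for this doc
    int num = val & COUNT_MASK;                   // low bits: number of values
    val >>= VALIDX_SHIFT;                         // signed shift: negative page offset
    int[] ids = new int[num];
    for (int i = 0; i < num; i++)
        ids[i] = page[i - val];                   // i - val == i + |offset|
    return ids;
}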
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
{
    string field = string.Intern(fieldName);
    int maxDoc = reader.MaxDoc;

    if (orderArray == null) // we want to reuse the memory
    {
        orderArray = NewInstance(termCountSize, maxDoc);
    }
    else
    {
        orderArray.EnsureCapacity(maxDoc); // no need to fill with 0; we are resetting the data anyway
    }

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int length = maxDoc + 1;
    ITermValueList list = listFactory == null ? new TermStringList() : listFactory.CreateTermList();
    TermDocs termDocs = reader.TermDocs();
    TermEnum termEnum = reader.Terms(new Term(field));
    int t = 0; // current term number

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    //int df = 0;
    t++;
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || string.CompareOrdinal(term.Field, field) != 0)
                break;

            if (t >= orderArray.MaxValue())
            {
                throw new System.IO.IOException("maximum number of values cannot exceed: " + orderArray.MaxValue());
            }
            // Alexey: well, we can now get more than one term per document; effectively, we could build a facet against a tokenized field
            /*// we expect that there is at most one term per document
            if (t >= length)
            {
                throw new RuntimeException("there are more terms than " + "documents in field \"" + field
                    + "\", but it's impossible to sort on " + "tokenized fields");
            }*/
            // store term text
            list.Add(term.Text);
            termDocs.Seek(termEnum);
            // freqList.add(termEnum.docFreq()); // doesn't take into account deleted docs
            int minID = -1;
            int maxID = -1;
            int df = 0;
            if (termDocs.Next())
            {
                df++;
                int docid = termDocs.Doc;
                orderArray.Add(docid, t);
                minID = docid;
                while (termDocs.Next())
                {
                    df++;
                    docid = termDocs.Doc;
                    orderArray.Add(docid, t);
                }
                maxID = docid;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        } while (termEnum.Next());
    }
    finally
    {
        termDocs.Dispose();
        termEnum.Dispose();
    }

    list.Seal();
    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();
}
public override FacetDataCache Load(BoboSegmentReader reader)
{
    TreeDictionary<object, List<int>> dataMap = null;
    List<int> docList = null;

    int nullMinId = -1;
    int nullMaxId = -1;
    int nullFreq = 0;

    int doc = -1;
    IBits liveDocs = reader.LiveDocs;
    for (int i = 0; i < reader.MaxDoc; ++i)
    {
        if (liveDocs != null && !liveDocs.Get(i))
        {
            continue;
        }
        doc = i;
        object val = m_facetDataFetcher.Fetch(reader, doc);
        if (val == null)
        {
            if (nullMinId < 0)
            {
                nullMinId = doc;
            }
            nullMaxId = doc;
            ++nullFreq;
            continue;
        }
        if (dataMap == null)
        {
            // Initialize.
            if (val is long[])
            {
                if (m_termListFactory == null)
                {
                    m_termListFactory = new TermFixedLengthInt64ArrayListFactory(((long[])val).Length);
                }
                dataMap = new TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerInt16ArrayComparer());
            }
            else if (val is IComparable)
            {
                dataMap = new TreeDictionary<object, List<int>>();
            }
            else
            {
                dataMap = new TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerObjectComparer());
            }
        }

        if (dataMap.Contains(val))
        {
            docList = dataMap[val];
        }
        else
        {
            docList = null;
        }

        if (docList == null)
        {
            docList = new List<int>();
            dataMap[val] = docList;
        }
        docList.Add(doc);
    }
    m_facetDataFetcher.Cleanup(reader);

    int maxDoc = reader.MaxDoc;
    int size = dataMap == null ? 1 : (dataMap.Count + 1);

    BigSegmentedArray order = new BigInt32Array(maxDoc);
    ITermValueList list = m_termListFactory == null
        ? new TermStringList(size)
        : m_termListFactory.CreateTermList(size);

    int[] freqs = new int[size];
    int[] minIDs = new int[size];
    int[] maxIDs = new int[size];

    list.Add(null);
    freqs[0] = nullFreq;
    minIDs[0] = nullMinId;
    maxIDs[0] = nullMaxId;

    if (dataMap != null)
    {
        int i = 1;
        int? docId;
        foreach (var entry in dataMap)
        {
            list.Add(list.Format(entry.Key));
            docList = entry.Value;
            freqs[i] = docList.Count;
            minIDs[i] = docList.Get(0, int.MinValue);
            while ((docId = docList.Poll(int.MinValue)) != int.MinValue)
            {
                doc = (int)docId;
                order.Add(doc, i);
            }
            maxIDs[i] = doc;
            ++i;
        }
    }
    list.Seal();

    FacetDataCache dataCache = new FacetDataCache(order, list, freqs, minIDs, maxIDs, TermCountSize.Large);
    return dataCache;
}
public override FacetDataCache Load(BoboIndexReader reader)
{
    int maxDoc = reader.MaxDoc;

    BigIntArray order = new BigIntArray(maxDoc);

    ITermValueList mterms = _termListFactory == null ? new TermStringList() : _termListFactory.CreateTermList();

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    TermDocs termDocs = null;
    TermEnum termEnum = null;
    int t = 0; // current term number
    mterms.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;
    try
    {
        termDocs = reader.TermDocs();
        termEnum = reader.Terms(new Term(_indexFieldName, ""));
        do
        {
            if (termEnum == null)
            {
                break;
            }
            Term term = termEnum.Term;
            if (term == null || !_indexFieldName.Equals(term.Field))
            {
                break;
            }

            // store term text
            // we expect that there is at most one term per document
            if (t > MAX_VAL_COUNT)
            {
                throw new IOException("maximum number of values cannot exceed: " + MAX_VAL_COUNT);
            }
            string val = term.Text;
            mterms.Add(val);
            int bit = (0x00000001 << (t - 1));
            termDocs.Seek(termEnum);
            //freqList.add(termEnum.docFreq()); // removed because the df doesn't take into account the number of deleted docs
            int df = 0;
            int minID = -1;
            int maxID = -1;
            if (termDocs.Next())
            {
                df++;
                int docid = termDocs.Doc;
                order.Add(docid, order.Get(docid) | bit);
                minID = docid;
                while (termDocs.Next())
                {
                    df++;
                    docid = termDocs.Doc;
                    order.Add(docid, order.Get(docid) | bit);
                }
                maxID = docid;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        } while (termEnum.Next());
    }
    finally
    {
        try
        {
            if (termDocs != null)
            {
                termDocs.Dispose();
            }
        }
        finally
        {
            if (termEnum != null)
            {
                termEnum.Dispose();
            }
        }
    }
    mterms.Seal();

    return new FacetDataCache(order, mterms, freqList.ToArray(), minIDList.ToArray(), maxIDList.ToArray(), TermCountSize.Large);
}
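// Worked example of the bitmask encoding above (illustrative values). Each
// term id t (1-based) is assigned bit (t - 1), and a document's order entry
// is the OR of the bits for every term it matches; an int entry can hold at
// most 32 distinct bits, hence the MAX_VAL_COUNT guard.
int orderEntry = 0;
orderEntry |= 1 << (1 - 1);                               // doc matches term 1 -> 0b01
orderEntry |= 1 << (2 - 1);                               // doc also matches term 2 -> 0b11
bool matchesTerm2 = (orderEntry & (1 << (2 - 1))) != 0;   // true
// An order entry of 0 means the document matched no term at all.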
/// <summary>
/// Loads multi-value facet data. This method uses a work area to prepare loading.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <param name="listFactory"></param>
/// <param name="workArea"></param>
public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxdoc = reader.MaxDoc;
    BigNestedInt32Array.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, field);
    int t = 1; // valid term id starts from 1
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);

    m_overflow = false;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            string strText = text.Utf8ToString();
            list.Add(strText);

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);

            int df = 0;
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                df++;
                if (!loader.Add(docID, valId))
                {
                    LogOverflow(fieldName);
                }
                minID = docID;
                bitset.FastSet(docID);
                while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                {
                    docID = docsEnum.DocID;
                    df++;
                    if (!loader.Add(docID, valId))
                    {
                        LogOverflow(fieldName);
                    }
                    bitset.FastSet(docID);
                }
                maxID = docID;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
    }

    list.Seal();

    try
    {
        m_nestedArray.Load(maxdoc + 1, loader);
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc < maxdoc)
    {
        this.m_minIDs[0] = doc;
        doc = maxdoc - 1;
        while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
}
public DefaultFacetCountCollectorFieldAccessor(ITermValueList valList)
{
    this.valList = valList;
}
public override FacetDataCache Load(BoboSegmentReader reader)
{
    int maxDoc = reader.MaxDoc;

    BigInt32Array order = new BigInt32Array(maxDoc);

    ITermValueList mterms = m_termListFactory == null ? new TermStringList() : m_termListFactory.CreateTermList();

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int t = 0; // current term number
    mterms.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    Terms terms = reader.GetTerms(m_indexFieldName);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            // store term text
            // we expect that there is at most one term per document
            if (t > MAX_VAL_COUNT)
            {
                throw new IOException("maximum number of values cannot exceed: " + MAX_VAL_COUNT);
            }
            string val = text.Utf8ToString();
            mterms.Add(val);
            int bit = (0x00000001 << (t - 1));

            Term term = new Term(m_indexFieldName, val);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);
            //freqList.add(termEnum.docFreq()); // removed because the df doesn't take into account the number of deleted docs
            int df = 0;
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                df++;
                order.Add(docID, order.Get(docID) | bit);
                minID = docID;
                while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                {
                    docID = docsEnum.DocID;
                    df++;
                    order.Add(docID, order.Get(docID) | bit);
                }
                maxID = docID;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
    }
    mterms.Seal();

    return new FacetDataCache(order, mterms, freqList.ToArray(), minIDList.ToArray(), maxIDList.ToArray(), TermCountSize.Large);
}
public virtual void Load(string latFieldName, string lonFieldName, BoboIndexReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader", "reader object is null");
    }

    FacetDataCache latCache = (FacetDataCache)reader.GetFacetData(latFieldName);
    FacetDataCache lonCache = (FacetDataCache)reader.GetFacetData(lonFieldName);

    int maxDoc = reader.MaxDoc;

    BigFloatArray xVals = this._xValArray;
    BigFloatArray yVals = this._yValArray;
    BigFloatArray zVals = this._zValArray;

    if (xVals == null)
    {
        xVals = NewInstance(maxDoc);
    }
    else
    {
        xVals.EnsureCapacity(maxDoc);
    }
    if (yVals == null)
    {
        yVals = NewInstance(maxDoc);
    }
    else
    {
        yVals.EnsureCapacity(maxDoc);
    }
    if (zVals == null)
    {
        zVals = NewInstance(maxDoc);
    }
    else
    {
        zVals.EnsureCapacity(maxDoc);
    }

    this._xValArray = xVals;
    this._yValArray = yVals;
    this._zValArray = zVals;

    BigSegmentedArray latOrderArray = latCache.OrderArray;
    ITermValueList latValList = latCache.ValArray;

    BigSegmentedArray lonOrderArray = lonCache.OrderArray;
    ITermValueList lonValList = lonCache.ValArray;

    for (int i = 0; i < maxDoc; ++i)
    {
        string docLatString = latValList.Get(latOrderArray.Get(i)).Trim();
        string docLonString = lonValList.Get(lonOrderArray.Get(i)).Trim();

        float docLat = 0;
        if (docLatString.Length > 0)
        {
            docLat = float.Parse(docLatString);
        }

        float docLon = 0;
        if (docLonString.Length > 0)
        {
            docLon = float.Parse(docLonString);
        }

        float[] coords = GeoMatchUtil.GeoMatchCoordsFromDegrees(docLat, docLon);
        _xValArray.Add(i, coords[0]);
        _yValArray.Add(i, coords[1]);
        _zValArray.Add(i, coords[2]);
    }
}
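// Illustrative sketch of the degrees-to-unit-sphere conversion that
// GeoMatchUtil.GeoMatchCoordsFromDegrees presumably performs (this is the
// standard mapping; the library's actual implementation may differ, e.g. by
// scaling to an earth radius).
static float[] ToCartesian(float latDeg, float lonDeg)
{
    double lat = latDeg * Math.PI / 180.0;
    double lon = lonDeg * Math.PI / 180.0;
    return new float[]
    {
        (float)(Math.Cos(lat) * Math.Cos(lon)),  // x
        (float)(Math.Cos(lat) * Math.Sin(lon)),  // y
        (float)Math.Sin(lat)                     // z
    };
}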
/// <summary>
/// Translates the int value using the val list.
/// </summary>
/// <param name="id"></param>
/// <param name="valarray"></param>
/// <returns></returns>
public object[] GetRawData(int id, ITermValueList valarray)
{
    // NOTE: Added Get() extension method call because the default .NET behavior
    // throws an exception if the index is out of bounds, rather than returning null.
    int[] page = _list.Get(id >> PAGEID_SHIFT);
    if (page == null)
    {
        return EMPTY;
    }
    else
    {
        int val = page[id & SLOTID_MASK];
        if (val >= 0)
        {
            return new object[] { valarray.GetRawValue(val) };
        }
        else if (val == MISSING)
        {
            return EMPTY;
        }
        else
        {
            int num = (val & COUNT_MASK);
            val >>= VALIDX_SHIFT; // signed shift; remember this is a negative number
            object[] ret = new object[num];
            for (int i = 0; i < num; i++)
            {
                ret[i] = valarray.GetRawValue(page[i - val]);
            }
            return ret;
        }
    }
}
/// <summary>
/// Translates the int value using the val list.
/// </summary>
/// <param name="id"></param>
/// <param name="valarray"></param>
/// <returns></returns>
public object[] getRawData(int id, ITermValueList valarray)
{
    int[] page = _list[id >> PAGEID_SHIFT];
    if (page == null)
    {
        return EMPTY;
    }
    else
    {
        int val = page[id & SLOTID_MASK];
        if (val >= 0)
        {
            return new object[] { valarray.GetRawValue(val) };
        }
        else if (val == MISSING)
        {
            return EMPTY;
        }
        else
        {
            int num = (val & COUNT_MASK);
            val >>= VALIDX_SHIFT; // signed shift; remember this is a negative number
            object[] ret = new object[num];
            for (int i = 0; i < num; i++)
            {
                ret[i] = valarray.GetRawValue(page[i - val]);
            }
            return ret;
        }
    }
}
public static IEnumerable<BrowseFacet> GetFacets(FacetSpec ospec, BigSegmentedArray count, int countlength, ITermValueList valList)
{
    if (ospec != null)
    {
        int minCount = ospec.MinHitCount;
        int max = ospec.MaxCount;
        if (max <= 0)
        {
            max = countlength;
        }

        LinkedList<BrowseFacet> facetColl;
        FacetSpec.FacetSortSpec sortspec = ospec.OrderBy;
        if (sortspec == FacetSpec.FacetSortSpec.OrderValueAsc)
        {
            facetColl = new LinkedList<BrowseFacet>();
            for (int i = 1; i < countlength; ++i) // exclude zero
            {
                int hits = count.Get(i);
                if (hits >= minCount)
                {
                    BrowseFacet facet = new BrowseFacet(valList.Get(i), hits);
                    facetColl.AddLast(facet);
                }
                if (facetColl.Count >= max)
                {
                    break;
                }
            }
        }
        else //if (sortspec == FacetSortSpec.OrderHitsDesc)
        {
            IComparatorFactory comparatorFactory;
            if (sortspec == FacetSpec.FacetSortSpec.OrderHitsDesc)
            {
                comparatorFactory = new FacetHitcountComparatorFactory();
            }
            else
            {
                comparatorFactory = ospec.CustomComparatorFactory;
            }

            if (comparatorFactory == null)
            {
                throw new ArgumentException("facet comparator factory not specified");
            }

            IComparer<int> comparator = comparatorFactory.NewComparator(new DefaultFacetCountCollectorFieldAccessor(valList), count);
            facetColl = new LinkedList<BrowseFacet>();
            int forbidden = -1;
            IntBoundedPriorityQueue pq = new IntBoundedPriorityQueue(comparator, max, forbidden);

            for (int i = 1; i < countlength; ++i) // exclude zero
            {
                int hits = count.Get(i);
                if (hits >= minCount)
                {
                    pq.Offer(i);
                }
            }

            int val;
            while ((val = pq.Poll()) != forbidden)
            {
                BrowseFacet facet = new BrowseFacet(valList[val], count.Get(val));
                facetColl.AddFirst(facet);
            }
        }
        return facetColl;
    }
    else
    {
        return FacetCountCollector_Fields.EMPTY_FACET_LIST;
    }
}
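// Illustrative usage sketch for GetFacets above (the count data is made up;
// names follow the other snippets on this page, and the FacetSpec property
// setters are assumed). Index 0 of the count array is the missing-value
// bucket, which is why both loops above start at i = 1.
var spec = new FacetSpec();
spec.MinHitCount = 1;
spec.MaxCount = 10;
spec.OrderBy = FacetSpec.FacetSortSpec.OrderHitsDesc;

int countlength = 3;                       // slot 0 (missing) + two terms
var counts = new BigIntArray(countlength);
counts.Add(1, 42);                         // 42 hits for term id 1
counts.Add(2, 7);                          // 7 hits for term id 2

foreach (BrowseFacet facet in GetFacets(spec, counts, countlength, valList))
{
    Console.WriteLine(facet);              // highest hit counts first
}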
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
{
    string field = string.Intern(fieldName);
    int maxDoc = reader.MaxDoc;

    BigSegmentedArray order = this.orderArray;
    if (order == null) // we want to reuse the memory
    {
        int dictValueCount = GetDictValueCount(reader, fieldName);
        order = NewInstance(dictValueCount, maxDoc);
    }
    else
    {
        order.EnsureCapacity(maxDoc); // no need to fill with 0; we are resetting the data anyway
    }
    this.orderArray = order;

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int length = maxDoc + 1;
    ITermValueList list = listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList();
    int negativeValueCount = GetNegativeValueCount(reader, field);

    TermDocs termDocs = reader.TermDocs();
    TermEnum termEnum = reader.Terms(new Term(field, ""));
    int t = 0; // current term number

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    int totalFreq = 0;
    //int df = 0;
    t++;
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || string.CompareOrdinal(term.Field, field) != 0)
            {
                break;
            }

            // store term text
            // we expect that there is at most one term per document
            // Alexey: well, we can now get more than one term per document; effectively, we could build a facet against a tokenized field
            //if (t >= length)
            //{
            //    throw new RuntimeException("there are more terms than " + "documents in field \"" + field
            //        + "\", but it's impossible to sort on " + "tokenized fields");
            //}
            list.Add(term.Text);

            termDocs.Seek(termEnum);
            // freqList.add(termEnum.docFreq()); // doesn't take into account deleted docs
            int minID = -1;
            int maxID = -1;
            int df = 0;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            if (termDocs.Next())
            {
                df++;
                int docid = termDocs.Doc;
                order.Add(docid, valId);
                minID = docid;
                while (termDocs.Next())
                {
                    df++;
                    docid = termDocs.Doc;
                    order.Add(docid, valId);
                }
                maxID = docid;
            }
            freqList.Add(df);
            totalFreq += df;
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        } while (termEnum.Next());
    }
    finally
    {
        termDocs.Dispose();
        termEnum.Dispose();
    }

    list.Seal();

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc <= maxDoc && order.Get(doc) != 0)
    {
        ++doc;
    }
    if (doc <= maxDoc)
    {
        this.minIDs[0] = doc;
        // Try to get the max
        doc = maxDoc;
        while (doc > 0 && order.Get(doc) != 0)
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxDoc + 1 - totalFreq;
}
public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxDoc = reader.MaxDoc;

    int dictValueCount = GetDictValueCount(reader, fieldName);
    BigSegmentedArray order = NewInstance(dictValueCount, maxDoc);
    this.m_orderArray = order;

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int length = maxDoc + 1;
    ITermValueList list = listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList();
    int negativeValueCount = GetNegativeValueCount(reader, field);

    int t = 1; // valid term id starts from 1
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    int totalFreq = 0;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            // store term text
            // we expect that there is at most one term per document
            if (t >= length)
            {
                throw new RuntimeException("there are more terms than documents in field \"" + field
                    + "\", but it's impossible to sort on tokenized fields");
            }
            string strText = text.Utf8ToString();
            list.Add(strText);

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);
            // freqList.add(termEnum.docFreq()); // doesn't take into account deleted docs
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            int df = 0;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                df++;
                order.Add(docID, valId);
                minID = docID;
                while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                {
                    docID = docsEnum.DocID;
                    df++;
                    order.Add(docID, valId);
                }
                maxID = docID;
            }
            freqList.Add(df);
            totalFreq += df;
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
    }

    list.Seal();
    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc < maxDoc && order.Get(doc) != 0)
    {
        ++doc;
    }
    if (doc < maxDoc)
    {
        this.m_minIDs[0] = doc;
        // Try to get the max
        doc = maxDoc - 1;
        while (doc >= 0 && order.Get(doc) != 0)
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = reader.NumDocs - totalFreq;
}
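// Worked example of the valId remapping used in the loaders above. Lucene
// enumerates terms in lexicographic order, and GetNegativeValueCount reports
// how many leading terms encode negative numbers; the expression
//     valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t
// reverses exactly that leading block and passes the rest through unchanged,
// so that term ids presumably end up ordered consistently with the numeric
// values they encode. With negativeValueCount = 3:
//     t = 1 -> valId = 3
//     t = 2 -> valId = 2
//     t = 3 -> valId = 1
//     t = 4 -> valId = 4   (unchanged from here on)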
/// <summary>
/// Loads multi-value facet data. This method uses a work area to prepare loading.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <param name="listFactory"></param>
/// <param name="workArea"></param>
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    long t0 = Environment.TickCount;
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet();
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;
    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                {
                    break;
                }

                string val = term.Text;
                if (val != null)
                {
                    list.Add(val);

                    tdoc.Seek(tenum);
                    //freqList.add(tenum.docFreq()); // removed because the df doesn't take into account the number of deleted docs
                    int df = 0;
                    int minID = -1;
                    int maxID = -1;
                    int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        minID = docid;
                        bitset.Set(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            bitset.Set(docid);
                        }
                        maxID = docid;
                    }
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }
                t++;
            } while (tenum.Next());
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
    }
    catch (System.IO.IOException)
    {
        throw; // rethrow without resetting the stack trace
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
}
public override FacetDataCache Load(BoboIndexReader reader)
{
    int doc = -1;
    C5.TreeDictionary<object, List<int>> dataMap = null;
    List<int> docList = null;

    int nullMinId = -1;
    int nullMaxId = -1;
    int nullFreq = 0;

    TermDocs termDocs = reader.TermDocs(null);
    try
    {
        while (termDocs.Next())
        {
            doc = termDocs.Doc;
            object val = _facetDataFetcher.Fetch(reader, doc);
            if (val == null)
            {
                if (nullMinId < 0)
                {
                    nullMinId = doc;
                }
                nullMaxId = doc;
                ++nullFreq;
                continue;
            }
            if (dataMap == null)
            {
                // Initialize.
                if (val is long[])
                {
                    if (_termListFactory == null)
                    {
                        _termListFactory = new TermFixedLengthLongArrayListFactory(((long[])val).Length);
                    }
                    dataMap = new C5.TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerLongArrayComparator());
                }
                else if (val is IComparable)
                {
                    // NOTE: In .NET 3.5, the default constructor doesn't work in this case. We therefore have a custom type
                    // that converts the objects to IComparable before comparing them, falling back to a string comparison
                    // if they don't convert. This differs from the Java implementation that uses the default constructor.
                    dataMap = new C5.TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerComparableComparator());
                }
                else
                {
                    dataMap = new C5.TreeDictionary<object, List<int>>(new VirtualSimpleFacetHandlerObjectComparator());
                }
            }

            if (dataMap.Contains(val))
            {
                docList = dataMap[val];
            }
            else
            {
                docList = null;
            }

            if (docList == null)
            {
                docList = new List<int>();
                dataMap[val] = docList;
            }
            docList.Add(doc);
        }
    }
    finally
    {
        termDocs.Dispose();
    }
    _facetDataFetcher.Cleanup(reader);

    int maxDoc = reader.MaxDoc;
    int size = dataMap == null ? 1 : (dataMap.Count + 1);

    BigSegmentedArray order = new BigIntArray(maxDoc);
    ITermValueList list = _termListFactory == null
        ? new TermStringList(size)
        : _termListFactory.CreateTermList(size);

    int[] freqs = new int[size];
    int[] minIDs = new int[size];
    int[] maxIDs = new int[size];

    list.Add(null);
    freqs[0] = nullFreq;
    minIDs[0] = nullMinId;
    maxIDs[0] = nullMaxId;

    if (dataMap != null)
    {
        int i = 1;
        int? docId;
        foreach (var entry in dataMap)
        {
            list.Add(list.Format(entry.Key));
            docList = entry.Value;
            freqs[i] = docList.Count;
            minIDs[i] = docList.Get(0, int.MinValue);
            while ((docId = docList.Poll(int.MinValue)) != int.MinValue)
            {
                doc = (int)docId;
                order.Add(doc, i);
            }
            maxIDs[i] = doc;
            ++i;
        }
    }
    list.Seal();

    FacetDataCache dataCache = new FacetDataCache(order, list, freqs, minIDs, maxIDs, TermCountSize.Large);
    return dataCache;
}