private BigSegmentedArray GetCollapsedCounts()
{
    if (m_collapsedCounts == null)
    {
        m_collapsedCounts = new LazyBigInt32Array(m_bucketValues.Count);
        FacetDataCache dataCache = m_subCollector.DataCache;
        ITermValueList subList = dataCache.ValArray;
        BigSegmentedArray subcounts = m_subCollector.Count;
        FixedBitSet indexSet = new FixedBitSet(subcounts.Length);
        int c = 0;
        int i = 0;
        foreach (string val in m_bucketValues)
        {
            if (val.Length > 0)
            {
                string[] subVals = m_predefinedBuckets.Get(val);
                int count = 0;
                foreach (string subVal in subVals)
                {
                    int index = subList.IndexOf(subVal);
                    if (index > 0)
                    {
                        int subcount = subcounts.Get(index);
                        count += subcount;
                        if (!indexSet.Get(index))
                        {
                            indexSet.Set(index);
                            c += dataCache.Freqs[index];
                        }
                    }
                }
                m_collapsedCounts.Add(i, count);
            }
            i++;
        }
        m_collapsedCounts.Add(0, (m_numdocs - c));
    }
    return m_collapsedCounts;
}
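// A minimal, self-contained sketch of the bucket-collapsing idea above, using plain dictionaries
// instead of the collector's data structures. All names here are hypothetical, not part of the
// BoboBrowse.Net API: each predefined bucket's count is the sum of its member sub-values' counts,
// and a set of already-seen sub-values tracks how many documents the buckets cover (mirroring the
// FixedBitSet bookkeeping) so the remainder can be assigned to the missing-value slot.
using System.Collections.Generic;

public static class BucketCollapseSketch
{
    public static IDictionary<string, int> Collapse(
        IDictionary<string, string[]> buckets,    // bucket name -> member sub-values
        IDictionary<string, int> subCounts,       // sub-value -> hit count for the current query
        IDictionary<string, int> subDocFreqs,     // sub-value -> document frequency in the index
        int numDocs)
    {
        var collapsed = new Dictionary<string, int>();
        var seen = new HashSet<string>();
        int covered = 0;
        foreach (var bucket in buckets)
        {
            int count = 0;
            foreach (string subVal in bucket.Value)
            {
                if (subCounts.TryGetValue(subVal, out int c))
                {
                    count += c;
                    if (seen.Add(subVal))              // count each sub-value's doc freq only once
                    {
                        covered += subDocFreqs[subVal];
                    }
                }
            }
            collapsed[bucket.Key] = count;
        }
        collapsed[string.Empty] = numDocs - covered;   // documents not covered by any bucket
        return collapsed;
    }
}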
private void Aggregate()
{
    if (m_isAggregated)
    {
        return;
    }

    m_isAggregated = true;

    int startIdx = m_valArray.IndexOf(m_start);
    if (startIdx < 0)
    {
        startIdx = -(startIdx + 1);
    }

    int endIdx = m_valArray.IndexOf(m_end);
    if (endIdx < 0)
    {
        endIdx = -(endIdx + 1);
    }

    BigSegmentedArray baseCounts = m_baseCollector.GetCountDistribution();
    if (m_start is long)
    {
        long start = Convert.ToInt64(m_start);
        long unit = Convert.ToInt64(m_unit);
        TermInt64List valArray = (TermInt64List)m_valArray;
        for (int i = startIdx; i < endIdx; i++)
        {
            long val = valArray.GetPrimitiveValue(i);
            int idx = (int)((val - start) / unit);
            if (idx >= 0 && idx < m_count.Length)
            {
                m_count.Add(idx, m_count.Get(idx) + baseCounts.Get(i));
            }
        }
    }
    else if (m_start is int)
    {
        int start = Convert.ToInt32(m_start);
        int unit = Convert.ToInt32(m_unit);
        TermInt32List valArray = (TermInt32List)m_valArray;
        for (int i = startIdx; i < endIdx; i++)
        {
            int val = valArray.GetPrimitiveValue(i);
            int idx = ((val - start) / unit);
            if (idx >= 0 && idx < m_count.Length)
            {
                m_count.Add(idx, m_count.Get(idx) + baseCounts.Get(i));
            }
        }
    }
    else
    {
        double start = Convert.ToDouble(m_start);
        double unit = Convert.ToDouble(m_unit);
        for (int i = startIdx; i < endIdx; i++)
        {
            double val = (double)m_valArray.GetRawValue(i);
            int idx = (int)((val - start) / unit);
            if (idx >= 0 && idx < m_count.Length)
            {
                m_count.Add(idx, m_count.Get(idx) + baseCounts.Get(i));
            }
        }
    }
}
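// Illustrative sketch (plain arrays, hypothetical names -- not the collector above) of the
// bucketing arithmetic Aggregate relies on: a term value v falls into histogram bucket
// (v - start) / unit, and the base collector's count for that term is folded into the bucket.
public static class HistogramBucketingSketch
{
    public static int[] BucketCounts(long[] termValues, int[] termCounts, long start, long end, long unit)
    {
        int numBuckets = (int)((end - start) / unit) + 1;
        int[] buckets = new int[numBuckets];
        for (int i = 0; i < termValues.Length; i++)
        {
            long val = termValues[i];
            if (val < start || val > end)
            {
                continue;                            // outside the requested range
            }
            int idx = (int)((val - start) / unit);   // same formula as Aggregate above
            if (idx >= 0 && idx < buckets.Length)
            {
                buckets[idx] += termCounts[i];       // accumulate the per-term count into its bucket
            }
        }
        return buckets;
    }
}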
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory)
{
    string field = string.Intern(fieldName);
    int maxDoc = reader.MaxDoc;

    BigSegmentedArray order = this.orderArray;
    if (order == null) // we want to reuse the memory
    {
        int dictValueCount = GetDictValueCount(reader, fieldName);
        order = NewInstance(dictValueCount, maxDoc);
    }
    else
    {
        order.EnsureCapacity(maxDoc); // no need to fill to 0, we are resetting the
                                      // data anyway
    }
    this.orderArray = order;

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int length = maxDoc + 1;
    ITermValueList list = listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList();
    int negativeValueCount = GetNegativeValueCount(reader, field);
    TermDocs termDocs = reader.TermDocs();
    TermEnum termEnum = reader.Terms(new Term(field, ""));
    int t = 0; // current term number

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    int totalFreq = 0;
    //int df = 0;
    t++;
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || string.CompareOrdinal(term.Field, field) != 0)
            {
                break;
            }

            // store term text
            // we expect that there is at most one term per document
            // Alexey: well, we could get now more than one term per document. Effectively, we could build facet against tokenized field
            //if (t >= length)
            //{
            //    throw new RuntimeException("there are more terms than " + "documents in field \"" + field
            //        + "\", but it's impossible to sort on " + "tokenized fields");
            //}
            list.Add(term.Text);

            termDocs.Seek(termEnum);
            // freqList.add(termEnum.docFreq()); // doesn't take into account deldocs
            int minID = -1;
            int maxID = -1;
            int df = 0;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            if (termDocs.Next())
            {
                df++;
                int docid = termDocs.Doc;
                order.Add(docid, valId);
                minID = docid;
                while (termDocs.Next())
                {
                    df++;
                    docid = termDocs.Doc;
                    order.Add(docid, valId);
                }
                maxID = docid;
            }
            freqList.Add(df);
            totalFreq += df;
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        } while (termEnum.Next());
    }
    finally
    {
        termDocs.Dispose();
        termEnum.Dispose();
    }

    list.Seal();
    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    // ordinal 0 is reserved for documents with no value; derive its min/max doc id and frequency
    int doc = 0;
    while (doc <= maxDoc && order.Get(doc) != 0)
    {
        ++doc;
    }
    if (doc <= maxDoc)
    {
        this.minIDs[0] = doc;
        // Try to get the max
        doc = maxDoc;
        while (doc > 0 && order.Get(doc) != 0)
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxDoc + 1 - totalFreq;
}
public virtual void Collect(int docid)
{
    int i = m_orderArray.Get(docid);
    m_count.Add(i, m_count.Get(i) + 1);
}
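// Illustrative sketch (not library code, names are hypothetical): the Collect call above is the
// classic single-value facet counting pattern -- each document stores the ordinal of its term in
// an order array, and collecting a hit document increments the count at that ordinal.
using System.Collections.Generic;

public static class OrdinalCountingSketch
{
    public static int[] CountFacets(int[] order, IEnumerable<int> hitDocIds, int numTerms)
    {
        int[] count = new int[numTerms];
        foreach (int docId in hitDocIds)
        {
            count[order[docId]]++;   // equivalent of m_count.Add(i, m_count.Get(i) + 1)
        }
        return count;
    }
}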
public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxDoc = reader.MaxDoc;

    int dictValueCount = GetDictValueCount(reader, fieldName);
    BigSegmentedArray order = NewInstance(dictValueCount, maxDoc);

    this.m_orderArray = order;

    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    int length = maxDoc + 1;
    ITermValueList list = listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList();
    int negativeValueCount = GetNegativeValueCount(reader, field);

    int t = 1; // valid term id starts from 1
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    int totalFreq = 0;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            // store term text
            // we expect that there is at most one term per document
            if (t >= length)
            {
                throw new RuntimeException("there are more terms than " + "documents in field \"" + field
                    + "\", but it's impossible to sort on " + "tokenized fields");
            }

            string strText = text.Utf8ToString();
            list.Add(strText);

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);

            // freqList.add(termEnum.docFreq()); // doesn't take into account deldocs
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            int df = 0;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                df++;
                order.Add(docID, valId);
                minID = docID;
                while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                {
                    docID = docsEnum.DocID;
                    df++;
                    order.Add(docID, valId);
                }
                maxID = docID;
            }
            freqList.Add(df);
            totalFreq += df;
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
    }

    list.Seal();
    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc < maxDoc && order.Get(doc) != 0)
    {
        ++doc;
    }
    if (doc < maxDoc)
    {
        this.m_minIDs[0] = doc;
        // Try to get the max
        doc = maxDoc - 1;
        while (doc >= 0 && order.Get(doc) != 0)
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = reader.NumDocs - totalFreq;
}
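// Conceptual sketch (plain arrays and hypothetical names, not the loader above) of what the two
// Load methods build from the inverted index: a forward "order" array mapping docID -> term
// ordinal (ordinal 0 means "no value"), plus per-ordinal frequency and min/max docID arrays that
// the collectors and range evaluators use later. It assumes at most one term per document and
// non-empty postings lists.
using System.Collections.Generic;
using System.Linq;

public static class ForwardIndexSketch
{
    public static (int[] order, int[] freqs, int[] minIDs, int[] maxIDs) Build(
        SortedDictionary<string, int[]> postings,   // term -> sorted docIDs containing it
        int maxDoc)
    {
        int numOrds = postings.Count + 1;           // ordinal 0 is reserved for "no value"
        var order = new int[maxDoc];
        var freqs = new int[numOrds];
        var minIDs = Enumerable.Repeat(-1, numOrds).ToArray();
        var maxIDs = Enumerable.Repeat(-1, numOrds).ToArray();
        int ord = 1;
        foreach (var posting in postings)
        {
            foreach (int docId in posting.Value)
            {
                order[docId] = ord;                 // forward mapping: docID -> term ordinal
            }
            freqs[ord] = posting.Value.Length;
            minIDs[ord] = posting.Value.First();
            maxIDs[ord] = posting.Value.Last();
            ord++;
        }
        freqs[0] = maxDoc - freqs.Skip(1).Sum();    // documents with no value for the field
        return (order, freqs, minIDs, maxIDs);
    }
}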
/// <summary>
/// Counts the given docid against each of the user-specified geo ranges.
/// </summary>
/// <param name="docid">The docid for which the facet counts are to be calculated</param>
public virtual void Collect(int docid)
{
    float docX = m_xvals.Get(docid);
    float docY = m_yvals.Get(docid);
    float docZ = m_zvals.Get(docid);

    float radius, targetX, targetY, targetZ, delta;
    float xu, xl, yu, yl, zu, zl;
    int countIndex = -1;
    foreach (GeoRange range in m_ranges)
    {
        // the countIndex for the count array should increment with the range index of the _ranges array
        countIndex++;
        if (m_miles)
        {
            radius = GeoMatchUtil.GetMilesRadiusCosine(range.Rad);
        }
        else
        {
            radius = GeoMatchUtil.GetKMRadiusCosine(range.Rad);
        }

        float[] coords = GeoMatchUtil.GeoMatchCoordsFromDegrees(range.Lat, range.Lon);
        targetX = coords[0];
        targetY = coords[1];
        targetZ = coords[2];

        if (m_miles)
        {
            delta = (float)(range.Rad / GeoMatchUtil.EARTH_RADIUS_MILES);
        }
        else
        {
            delta = (float)(range.Rad / GeoMatchUtil.EARTH_RADIUS_KM);
        }

        // try to see if the range checks can short circuit the actual inCircle check
        xu = targetX + delta;
        xl = targetX - delta;
        if (docX > xu || docX < xl)
        {
            continue;
        }

        yu = targetY + delta;
        yl = targetY - delta;
        if (docY > yu || docY < yl)
        {
            continue;
        }

        zu = targetZ + delta;
        zl = targetZ - delta;
        if (docZ > zu || docZ < zl)
        {
            continue;
        }

        if (GeoFacetFilter.InCircle(docX, docY, docZ, targetX, targetY, targetZ, radius))
        {
            // if the lat, lon values of this docid match the current user-specified range, then increment the
            // appropriate count[] value
            m_count.Add(countIndex, m_count.Get(countIndex) + 1);
            // do not break here, since one document could lie in multiple user-specified ranges
        }
    }
}
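// Hedged sketch of the short-circuit idea used in the geo Collect above (this helper and its
// names are illustrative, not the GeoMatchUtil/GeoFacetFilter API): before the relatively
// expensive in-circle test, each unit-sphere coordinate is checked against an axis-aligned
// bounding box of half-width delta around the target point; any coordinate outside the box
// lets the range be skipped cheaply.
public static class GeoPrecheckSketch
{
    public static bool MightBeInRange(float docX, float docY, float docZ,
                                      float targetX, float targetY, float targetZ,
                                      float delta)
    {
        // A point can only lie within the spherical cap if every coordinate falls inside
        // the bounding box; failing any axis check rules the range out immediately.
        return docX <= targetX + delta && docX >= targetX - delta
            && docY <= targetY + delta && docY >= targetY - delta
            && docZ <= targetZ + delta && docZ >= targetZ - delta;
    }
}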
public virtual void Collect(int docid)
{
    int i = _array.Get(docid);
    _count.Add(i, _count.Get(i) + 1);
}