public void MissingTerms_Test()
{
    string fieldName = "field1";
    RAMDirectory rd = new RAMDirectory();
    IndexWriter w = new IndexWriter(rd, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    for (int i = 0; i < 100; i++)
    {
        Document doc = new Document();
        int term = i * 10; // terms are multiples of 10
        doc.Add(new Field(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
        w.AddDocument(doc);
    }
    w.Close();

    IndexReader reader = IndexReader.Open(rd, true);

    TermsFilter tf = new TermsFilter();
    tf.AddTerm(new Term(fieldName, "19"));
    OpenBitSet bits = (OpenBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(0, bits.Cardinality(), "Must match nothing");

    tf.AddTerm(new Term(fieldName, "20"));
    bits = (OpenBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(1, bits.Cardinality(), "Must match 1");

    tf.AddTerm(new Term(fieldName, "10"));
    bits = (OpenBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(2, bits.Cardinality(), "Must match 2");

    tf.AddTerm(new Term(fieldName, "00"));
    bits = (OpenBitSet)tf.GetDocIdSet(reader);
    Assert.AreEqual(2, bits.Cardinality(), "Must match 2");
}
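A minimal sketch of what the assertions above rely on: Cardinality() simply counts the set bits, so a term that matches no document leaves the count unchanged. This snippet is illustrative and not taken from the test suite; it uses only Lucene.Net's OpenBitSet.

using System;
using Lucene.Net.Util;

class CardinalityDemo
{
    static void Main()
    {
        OpenBitSet bits = new OpenBitSet(64);
        Console.WriteLine(bits.Cardinality()); // 0

        bits.Set(10);
        bits.Set(20);
        bits.Set(20); // setting an already-set bit does not change the count
        Console.WriteLine(bits.Cardinality()); // 2
    }
}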
public override RandomAccessDocIdSet GetRandomAccessDocIdSet(BoboSegmentReader reader)
{
    MultiValueFacetDataCache dataCache = m_facetHandler.GetFacetData<MultiValueFacetDataCache>(reader);
    int[] index = m_valueConverter.Convert(dataCache, m_vals);
    //BigNestedIntArray nestedArray = dataCache.NestedArray;
    OpenBitSet bitset = new OpenBitSet(dataCache.ValArray.Count);
    foreach (int i in index)
    {
        bitset.FastSet(i);
    }

    if (m_takeCompliment)
    {
        // flip the bits
        int size = dataCache.ValArray.Count;
        for (int i = 0; i < size; ++i)
        {
            bitset.FastFlip(i);
        }
    }

    long count = bitset.Cardinality();
    if (count == 0)
    {
        return new EmptyRandomAccessDocIdSet();
    }
    else
    {
        return new MultiRandomAccessDocIdSet(dataCache, bitset);
    }
}
protected virtual void Condense(float[] floats)
{
    if (floats.Length != m_capacity)
    {
        throw new ArgumentException("bad input float array of length " + floats.Length + " for capacity: " + m_capacity);
    }

    var bits = new OpenBitSet(floats.Length);
    int on = 0;
    for (int i = 0; i < floats.Length; i++)
    {
        if (floats[i] != 0f)
        {
            bits.Set(i);
            on++;
        }
    }

    if (((float)on) / ((float)floats.Length) < ON_RATIO_CUTOFF)
    {
        // it's worth compressing
        if (0 == on)
        {
            // it's worth super-compressing
            m_floats = null;
            m_bits = null;
            m_referencePoints = null;
            // capacity is good.
        }
        else
        {
            m_bits = bits;
            m_floats = new float[m_bits.Cardinality()];
            m_referencePoints = new int[floats.Length / REFERENCE_POINT_EVERY];
            int i = 0;
            int floatsIdx = 0;
            int refIdx = 0;
            while (i < floats.Length && (i = m_bits.NextSetBit(i)) >= 0)
            {
                m_floats[floatsIdx] = floats[i];
                while (refIdx < i / REFERENCE_POINT_EVERY)
                {
                    m_referencePoints[refIdx++] = floatsIdx;
                }
                floatsIdx++;
                i++;
            }
            while (refIdx < m_referencePoints.Length)
            {
                m_referencePoints[refIdx++] = floatsIdx;
            }
        }
    }
    else
    {
        // it's not worth compressing
        m_floats = floats;
        m_bits = null;
    }
}
/// <summary>
/// Calculate the similarity score (Tanimoto/Jaccard coefficient) between a pair of FingerprintMxs
/// </summary>
/// <param name="fp1"></param>
/// <param name="fp2"></param>
/// <returns></returns>
public static float CalculateFingerprintPairSimilarityScore(
    FingerprintMx fp1,
    FingerprintMx fp2)
{
    long[] fp1Array = fp1.ToLongArray();
    OpenBitSet fp1BitSet = new OpenBitSet(fp1Array, fp1Array.Length);
    int fp1Card = (int)fp1BitSet.Cardinality();

    long[] fp2Array = fp2.ToLongArray();
    OpenBitSet fp2BitSet = new OpenBitSet(fp2Array, fp2Array.Length);
    int fp2Card = (int)fp2BitSet.Cardinality();

    fp2BitSet.Intersect(fp1BitSet);
    int commonCnt = (int)fp2BitSet.Cardinality();

    float simScore = commonCnt / (float)(fp1Card + fp2Card - commonCnt);
    return simScore;
}
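Because Intersect() mutates its receiver, the method above captures both cardinalities before intersecting. Below is a non-mutating sketch of the same Tanimoto calculation; it assumes the Lucene.Net port exposes the static OpenBitSet.IntersectionCount helper that Java Lucene provides.

using Lucene.Net.Util;

static class SimilarityUtil
{
    // Hedged sketch: relies on OpenBitSet.IntersectionCount(a, b), which counts
    // |A ∩ B| without modifying either operand (assumed to exist in this port).
    public static float TanimotoScore(OpenBitSet a, OpenBitSet b)
    {
        long common = OpenBitSet.IntersectionCount(a, b);
        long union = a.Cardinality() + b.Cardinality() - common;
        return union == 0 ? 0f : common / (float)union; // two empty sets score 0 here by choice
    }
}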
public override RandomAccessDocIdSet GetRandomAccessDocIdSet(BoboSegmentReader reader)
{
    FacetDataCache dataCache = m_facetDataCacheBuilder.Build(reader);
    OpenBitSet openBitSet = GetBitSet(dataCache);
    long count = openBitSet.Cardinality();
    if (count == 0)
    {
        return EmptyDocIdSet.Instance;
    }
    else
    {
        bool multi = dataCache is MultiValueFacetDataCache;
        MultiValueFacetDataCache multiCache = multi ? (MultiValueFacetDataCache)dataCache : null;
        return new BitSetRandomAccessDocIdSet(multi, multiCache, openBitSet, dataCache);
    }
}
public virtual void Write(FieldsConsumer consumer)
{
    Array.Sort(terms);
    TermsConsumer termsConsumer = consumer.AddField(fieldInfo);
    long sumTotalTermCount = 0;
    long sumDF = 0;
    OpenBitSet visitedDocs = new OpenBitSet();
    foreach (TermData term in terms)
    {
        for (int i = 0; i < term.docs.Length; i++)
        {
            visitedDocs.Set(term.docs[i]);
        }
        sumDF += term.docs.Length;
        sumTotalTermCount += term.Write(termsConsumer);
    }
    termsConsumer.Finish(omitTF ? -1 : sumTotalTermCount, sumDF, (int)visitedDocs.Cardinality());
}
public override void Finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
{
    Debug.Assert(state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0);
    state = TermsConsumerState.FINISHED;
    Debug.Assert(docCount >= 0);
    Debug.Assert(docCount == visitedDocs.Cardinality());
    Debug.Assert(sumDocFreq >= docCount);
    Debug.Assert(sumDocFreq == this.sumDocFreq);
    if (fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY)
    {
        Debug.Assert(sumTotalTermFreq == -1);
    }
    else
    {
        Debug.Assert(sumTotalTermFreq >= sumDocFreq);
        Debug.Assert(sumTotalTermFreq == this.sumTotalTermFreq);
    }
    @in.Finish(sumTotalTermFreq, sumDocFreq, docCount);
}
// Compares a BitArray with an OpenBitSet
public static bool Equal(this BitArray a, OpenBitSet b)
{
    var bitArrayCardinality = a.Cardinality();
    if (bitArrayCardinality != b.Cardinality())
    {
        return false;
    }

    for (int i = 0; i < bitArrayCardinality; i++)
    {
        if (a.SafeGet(i) != b.Get(i))
        {
            return false;
        }
    }
    return true;
}
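A hedged usage sketch for the extension above; it assumes the BitArray Cardinality() and SafeGet() helpers it calls live in the same static extension class and are in scope.

using System.Collections;
using Lucene.Net.Util;

// Illustrative only: both containers hold bits 1 and 3, so Equal(...) returns true.
var bitArray = new BitArray(8);
bitArray.Set(1, true);
bitArray.Set(3, true);

var openBitSet = new OpenBitSet(8);
openBitSet.Set(1);
openBitSet.Set(3);

bool same = bitArray.Equal(openBitSet); // true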
internal void Calculate()
{
    if (_QueryDocidSet == DocIdBitSet.EMPTY_DOCIDSET)
    {
        _ResultBitSet = new OpenBitSet(0);
    }
    else
    {
        _ResultBitSet = (OpenBitSet)((OpenBitSet)_QueryDocidSet).Clone();
        _ResultBitSet.And(_GroupBitSet);
    }

    _ResultIterator = _ResultBitSet.Iterator();
    _HitCount = _ResultBitSet.Cardinality();

    _ResultBitSet = null;
    _QueryDocidSet = null;
    _GroupBitSet = null;
}
private void DoTestMultiThreads(bool withTimeout)
{
    ThreadClass[] threadArray = new ThreadClass[N_THREADS];
    OpenBitSet success = new OpenBitSet(N_THREADS);
    for (int i = 0; i < threadArray.Length; ++i)
    {
        int num = i;
        threadArray[num] = new ThreadClassAnonymousHelper(this, success, withTimeout, num);
    }
    for (int i = 0; i < threadArray.Length; ++i)
    {
        threadArray[i].Start();
    }
    for (int i = 0; i < threadArray.Length; ++i)
    {
        threadArray[i].Join();
    }
    assertEquals("some threads failed!", N_THREADS, success.Cardinality());
}
/// <summary>
/// loads multi-value facet data. This method uses a workarea to prepare loading.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <param name="listFactory"></param>
/// <param name="workArea"></param>
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    long t0 = Environment.TickCount;
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet();
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;
    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                {
                    break;
                }

                string val = term.Text;
                if (val != null)
                {
                    list.Add(val);

                    tdoc.Seek(tenum);
                    //freqList.add(tenum.docFreq()); // removed because the df doesn't take into account the num of deletedDocs
                    int df = 0;
                    int minID = -1;
                    int maxID = -1;
                    int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        minID = docid;
                        bitset.Set(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            bitset.Set(docid);
                        }
                        maxID = docid;
                    }
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }
                t++;
            }
            while (tenum.Next());
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
    }
    catch (System.IO.IOException e)
    {
        throw e;
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
}
public long Count()
{
    return _openBitSet.Cardinality();
}
public override int Size()
{
    return (int)bitSet.Cardinality();
}
/// <summary>
/// loads multi-value facet data. This method uses a workarea to prepare loading.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <param name="listFactory"></param>
/// <param name="workArea"></param>
public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxdoc = reader.MaxDoc;
    BigNestedInt32Array.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, field);
    int t = 1; // valid term id starts from 1

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);

    m_overflow = false;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            string strText = text.Utf8ToString();
            list.Add(strText);

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);
            int df = 0;
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                df++;
                if (!loader.Add(docID, valId))
                {
                    LogOverflow(fieldName);
                }
                minID = docID;
                bitset.FastSet(docID);
                while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                {
                    docID = docsEnum.DocID;
                    df++;
                    if (!loader.Add(docID, valId))
                    {
                        LogOverflow(fieldName);
                    }
                    bitset.FastSet(docID);
                }
                maxID = docID;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
    }

    list.Seal();

    try
    {
        m_nestedArray.Load(maxdoc + 1, loader);
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc < maxdoc)
    {
        this.m_minIDs[0] = doc;
        doc = maxdoc - 1;
        while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
}
public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    long t0 = System.Environment.TickCount;
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
    BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;

    string pre = null;
    int df = 0;
    int minID = -1;
    int maxID = -1;
    int valId = 0;

    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                {
                    break;
                }

                string val = term.Text;
                if (val != null)
                {
                    int weight = 0;
                    string[] split = val.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
                    if (split.Length > 1)
                    {
                        val = split[0];
                        weight = int.Parse(split[split.Length - 1]);
                    }
                    if (pre == null || !val.Equals(pre))
                    {
                        if (pre != null)
                        {
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }
                        list.Add(val);
                        df = 0;
                        minID = -1;
                        maxID = -1;
                        valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                        t++;
                    }

                    tdoc.Seek(tenum);
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        else
                        {
                            weightLoader.Add(docid, weight);
                        }
                        if (docid < minID)
                        {
                            minID = docid;
                        }
                        bitset.FastSet(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            else
                            {
                                weightLoader.Add(docid, weight);
                            }
                            bitset.FastSet(docid);
                        }
                        if (docid > maxID)
                        {
                            maxID = docid;
                        }
                    }
                    pre = val;
                }
            }
            while (tenum.Next());

            if (pre != null)
            {
                freqList.Add(df);
                minIDList.Add(minID);
                maxIDList.Add(maxID);
            }
        }
    }
    finally
    {
        try
        {
            if (tdoc != null)
            {
                tdoc.Dispose();
            }
        }
        finally
        {
            if (tenum != null)
            {
                tenum.Dispose();
            }
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
        _weightArray.Load(maxdoc + 1, weightLoader);
    }
    catch (System.IO.IOException e)
    {
        throw e;
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
}
/// <summary>
/// Search a single file
/// </summary>
void SearchSingleFile(int fi)
{
    StructSearchMatch sm = null;

    AssertMx.IsNotNull(FpDao, "FpDao");

    List<StructSearchMatch> matchList = FileMatchLists[fi];
    AssertMx.IsNotNull(matchList, "matchList");

    OpenBitSet queryObs = new OpenBitSet(QueryFpLongArray, QueryFpLongArray.Length);
    AssertMx.IsNotNull(queryObs, "queryObs");

    OpenBitSet dbObs = new OpenBitSet(QueryFpLongArray, QueryFpLongArray.Length); // gets set to DB fp for intersect
    AssertMx.IsNotNull(dbObs, "dbObs");

    FileStream fs = FileStreamReaders[fi];
    AssertMx.IsNotNull(fs, "fs");

    ReadFingerprintRecArgs a = new ReadFingerprintRecArgs();
    a.Initialize(fs, QueryFpLongArray.Length);

    try
    {
        while (true)
        {
            bool readOk = FpDao.ReadRawFingerprintRec(a);
            if (!readOk)
            {
                break;
            }

            //if (IsSrcCidMatch("03435269", a)) a = a; // debug

            dbObs.Bits = a.fingerprint;
            dbObs.Intersect(queryObs);
            int commonCnt = (int)dbObs.Cardinality();
            float simScore = commonCnt / (float)(a.cardinality + QueryFpCardinality - commonCnt);
            if (simScore >= MinimumSimilarity)
            {
                sm = ReadFingerprintRec_To_StructSearchMatch(a);
                sm.SearchType = StructureSearchType.MolSim;
                sm.MatchScore = simScore;
                matchList.Add(sm);
            }
        }
    }
    catch (Exception ex)
    {
        string msg = ex.Message;
        msg += string.Format("\r\nfi: {0}, fs.Name: {1}, sm: {2}", fi, fs.Name, sm != null ? sm.Serialize() : "");
        DebugLog.Message(DebugLog.FormatExceptionMessage(ex, msg));
        throw new Exception(msg, ex);
    }

    return;
}
public int HitCount()
{
    return (int)bits.Cardinality();
}
public override void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxdoc = reader.MaxDoc;
    BigNestedInt32Array.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
    BigNestedInt32Array.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

    var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, field);
    int t = 1; // valid term id starts from 1

    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);

    m_overflow = false;

    string pre = null;
    int df = 0;
    int minID = -1;
    int maxID = -1;
    int docID = -1;
    int valId = 0;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            string strText = text.Utf8ToString();
            string val = null;
            int weight = 0;
            string[] split = strText.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
            if (split.Length > 1)
            {
                val = split[0];
                weight = int.Parse(split[split.Length - 1]);
            }
            else
            {
                continue;
            }

            if (pre == null || !val.Equals(pre))
            {
                if (pre != null)
                {
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }
                list.Add(val);
                df = 0;
                minID = -1;
                maxID = -1;
                valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                t++;
            }

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);
            if (docsEnum != null)
            {
                while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                {
                    df++;
                    if (!loader.Add(docID, valId))
                    {
                        LogOverflow(fieldName);
                    }
                    else
                    {
                        weightLoader.Add(docID, weight);
                    }
                    if (docID < minID)
                    {
                        minID = docID;
                    }
                    bitset.FastSet(docID);
                    while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                    {
                        docID = docsEnum.DocID;
                        df++;
                        if (!loader.Add(docID, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        else
                        {
                            weightLoader.Add(docID, weight);
                        }
                        bitset.FastSet(docID);
                    }
                    if (docID > maxID)
                    {
                        maxID = docID;
                    }
                }
            }
            pre = val;
        }

        if (pre != null)
        {
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
        }
    }

    list.Seal();

    try
    {
        m_nestedArray.Load(maxdoc + 1, loader);
        m_weightArray.Load(maxdoc + 1, weightLoader);
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    int doc = 0;
    while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc < maxdoc)
    {
        this.m_minIDs[0] = doc;
        doc = maxdoc - 1;
        while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
}