public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea)
{
    long t0 = System.Environment.TickCount;
    int maxdoc = reader.MaxDoc;
    BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
    BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName));
    int t = 0; // current term number

    // Reserve index 0 for the "no value" term.
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);
    t++;

    _overflow = false;

    string pre = null;
    int df = 0;
    int minID = -1;
    int maxID = -1;
    int valId = 0;

    try
    {
        tdoc = reader.TermDocs();
        tenum = reader.Terms(new Term(fieldName, ""));
        if (tenum != null)
        {
            do
            {
                Term term = tenum.Term;
                if (term == null || !fieldName.Equals(term.Field))
                {
                    break;
                }
                string val = term.Text;
                if (val != null)
                {
                    // Terms are stored as "value\0weight"; split the weight off the value.
                    int weight = 0;
                    string[] split = val.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
                    if (split.Length > 1)
                    {
                        val = split[0];
                        weight = int.Parse(split[split.Length - 1]);
                    }
                    if (pre == null || !val.Equals(pre))
                    {
                        // New value: flush the stats accumulated for the previous one.
                        if (pre != null)
                        {
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }
                        list.Add(val);
                        df = 0;
                        minID = -1;
                        maxID = -1;
                        valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                        t++;
                    }

                    tdoc.Seek(tenum);
                    if (tdoc.Next())
                    {
                        df++;
                        int docid = tdoc.Doc;
                        if (!loader.Add(docid, valId)) LogOverflow(fieldName);
                        else weightLoader.Add(docid, weight);
                        // minID starts at -1, so "docid < minID" alone could never fire;
                        // also accept the first doc seen for this value.
                        if (minID == -1 || docid < minID) minID = docid;
                        bitset.FastSet(docid);
                        while (tdoc.Next())
                        {
                            df++;
                            docid = tdoc.Doc;
                            if (!loader.Add(docid, valId)) LogOverflow(fieldName);
                            else weightLoader.Add(docid, weight);
                            bitset.FastSet(docid);
                        }
                        if (docid > maxID) maxID = docid;
                    }
                    pre = val;
                }
            }
            while (tenum.Next());
            if (pre != null)
            {
                freqList.Add(df);
                minIDList.Add(minID);
                maxIDList.Add(maxID);
            }
        }
    }
    finally
    {
        try
        {
            if (tdoc != null) tdoc.Dispose();
        }
        finally
        {
            if (tenum != null) tenum.Dispose();
        }
    }

    list.Seal();

    try
    {
        _nestedArray.Load(maxdoc + 1, loader);
        _weightArray.Load(maxdoc + 1, weightLoader);
    }
    catch (System.IO.IOException)
    {
        throw; // rethrow without resetting the stack trace
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.ToArray();
    this.minIDs = minIDList.ToArray();
    this.maxIDs = maxIDList.ToArray();

    // Compute the doc range and frequency of the "no value" term (index 0).
    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc <= maxdoc)
    {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        if (doc > 0)
        {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality();
}
/// <summary>
/// Initializes a new instance of <see cref="SearchBit"/>.
/// </summary>
/// <param name="openBitSet">The open bit set.</param>
public SearchBit(OpenBitSet openBitSet)
{
    Prevent.ParameterNull(openBitSet, nameof(openBitSet));
    _openBitSet = openBitSet;
}
internal AssertingPostingsConsumer(PostingsConsumer @in, FieldInfo fieldInfo, OpenBitSet visitedDocs)
{
    this.@in = @in;
    this.fieldInfo = fieldInfo;
    this.visitedDocs = visitedDocs;
}
/// <summary>
/// Search a single file.
/// </summary>
void SearchSingleFile(int fi)
{
    StructSearchMatch sm = null;

    AssertMx.IsNotNull(FpDao, "FpDao");

    List<StructSearchMatch> matchList = FileMatchLists[fi];
    AssertMx.IsNotNull(matchList, "matchList");

    OpenBitSet queryObs = new OpenBitSet(QueryFpLongArray, QueryFpLongArray.Length);
    AssertMx.IsNotNull(queryObs, "queryObs");

    OpenBitSet dbObs = new OpenBitSet(QueryFpLongArray, QueryFpLongArray.Length); // gets set to DB fp for intersect
    AssertMx.IsNotNull(dbObs, "dbObs");

    FileStream fs = FileStreamReaders[fi];
    AssertMx.IsNotNull(fs, "fs");

    ReadFingerprintRecArgs a = new ReadFingerprintRecArgs();
    a.Initialize(fs, QueryFpLongArray.Length);

    try
    {
        while (true)
        {
            bool readOk = FpDao.ReadRawFingerprintRec(a);
            if (!readOk) break;

            // Tanimoto similarity: common bits / (bits in A + bits in B - common bits)
            dbObs.Bits = a.fingerprint;
            dbObs.Intersect(queryObs);
            int commonCnt = (int)dbObs.Cardinality();
            float simScore = commonCnt / (float)(a.cardinality + QueryFpCardinality - commonCnt);
            if (simScore >= MinimumSimilarity)
            {
                sm = ReadFingerprintRec_To_StructSearchMatch(a);
                sm.SearchType = StructureSearchType.MolSim;
                sm.MatchScore = simScore;
                matchList.Add(sm);
            }
        }
    }
    catch (Exception ex)
    {
        string msg = ex.Message;
        msg += string.Format("\r\nfi: {0}, fs.Name: {1}, sm: {2}", fi, fs.Name, sm != null ? sm.Serialize() : "");
        DebugLog.Message(DebugLog.FormatExceptionMessage(ex, msg));
        throw new Exception(msg, ex);
    }
}
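The score above is the Tanimoto coefficient over fingerprint bits: |A ∩ B| / (|A| + |B| - |A ∩ B|). A minimal, self-contained sketch of that computation with Lucene.NET's OpenBitSet; the bit positions below are invented for illustration:

using System;
using Lucene.Net.Util;

static class TanimotoSketch
{
    // Tanimoto similarity of two fingerprints; IntersectionCount computes the
    // common-bit count without mutating either set.
    static float Tanimoto(OpenBitSet a, OpenBitSet b)
    {
        long common = OpenBitSet.IntersectionCount(a, b);
        return common / (float)(a.Cardinality() + b.Cardinality() - common);
    }

    static void Main()
    {
        var query = new OpenBitSet(64);
        var db = new OpenBitSet(64);
        query.Set(1); query.Set(5); query.Set(9);
        db.Set(1); db.Set(5); db.Set(20);
        Console.WriteLine(Tanimoto(query, db)); // 2 common / 4 distinct bits = 0.5
    }
}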
private int FindIn(OpenBitSet openBitSet, int baseVal, int val)
{
    return -1; // stub: never reports a match
}
/// <summary>
/// Returns <c>true</c> if the given string is accepted by the automaton.
/// <para/>
/// Complexity: linear in the length of the string.
/// <para/>
/// <b>Note:</b> for full performance, use the <see cref="RunAutomaton"/> class.
/// </summary>
public static bool Run(Automaton a, string s)
{
    if (a.IsSingleton)
    {
        return s.Equals(a.singleton, StringComparison.Ordinal);
    }
    if (a.deterministic)
    {
        // Deterministic: follow the single transition per code point.
        State p = a.initial;
        int cp; // LUCENENET: Removed unnecessary assignment
        for (int i = 0; i < s.Length; i += Character.CharCount(cp))
        {
            State q = p.Step(cp = Character.CodePointAt(s, i));
            if (q == null)
            {
                return false;
            }
            p = q;
        }
        return p.accept;
    }
    else
    {
        // Nondeterministic: simulate the set of reachable states, using an
        // OpenBitSet for membership so each state enters the worklist once.
        State[] states = a.GetNumberedStates();
        LinkedList<State> pp = new LinkedList<State>();
        LinkedList<State> pp_other = new LinkedList<State>();
        OpenBitSet bb = new OpenBitSet(states.Length);
        OpenBitSet bb_other = new OpenBitSet(states.Length);
        pp.AddLast(a.initial);
        List<State> dest = new List<State>();
        bool accept = a.initial.accept;
        int c; // LUCENENET: Removed unnecessary assignment
        for (int i = 0; i < s.Length; i += Character.CharCount(c))
        {
            c = Character.CodePointAt(s, i);
            accept = false;
            pp_other.Clear();
            bb_other.Clear(0, bb_other.Length - 1);
            foreach (State p in pp)
            {
                dest.Clear();
                p.Step(c, dest);
                foreach (State q in dest)
                {
                    if (q.accept)
                    {
                        accept = true;
                    }
                    if (!bb_other.Get(q.number))
                    {
                        bb_other.Set(q.number);
                        pp_other.AddLast(q);
                    }
                }
            }
            // Swap current/next state sets for the next code point.
            LinkedList<State> tp = pp;
            pp = pp_other;
            pp_other = tp;
            OpenBitSet tb = bb;
            bb = bb_other;
            bb_other = tb;
        }
        return accept;
    }
}
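A quick usage sketch for the method above, assuming it lives on Lucene.NET's BasicOperations class in Lucene.Net.Util.Automaton (as it does upstream); the regular expression is arbitrary:

using System;
using Lucene.Net.Util.Automaton;

// Build an automaton for ab+c? and test a few strings against it.
var a = new RegExp("ab+c?").ToAutomaton();
Console.WriteLine(BasicOperations.Run(a, "abb"));  // true
Console.WriteLine(BasicOperations.Run(a, "abbc")); // true
Console.WriteLine(BasicOperations.Run(a, "ac"));   // false: b+ needs at least one 'b'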
/// <summary>
/// Minimizes the given automaton using Hopcroft's algorithm.
/// </summary>
public static void MinimizeHopcroft(Automaton a)
{
    a.Determinize();
    if (a.initial.numTransitions == 1)
    {
        Transition t = a.initial.TransitionsArray[0];
        if (t.to == a.initial && t.min == Character.MinCodePoint && t.max == Character.MaxCodePoint)
        {
            return;
        }
    }
    a.Totalize();

    // initialize data structures
    int[] sigma = a.GetStartPoints();
    State[] states = a.GetNumberedStates();
    int sigmaLen = sigma.Length, statesLen = states.Length;
    JCG.List<State>[,] reverse = new JCG.List<State>[statesLen, sigmaLen];
    ISet<State>[] partition = new JCG.HashSet<State>[statesLen];
    JCG.List<State>[] splitblock = new JCG.List<State>[statesLen];
    int[] block = new int[statesLen];
    StateList[,] active = new StateList[statesLen, sigmaLen];
    StateListNode[,] active2 = new StateListNode[statesLen, sigmaLen];
    Queue<Int32Pair> pending = new Queue<Int32Pair>(); // LUCENENET specific - Queue is much more performant than LinkedList
    OpenBitSet pending2 = new OpenBitSet(sigmaLen * statesLen);
    OpenBitSet split = new OpenBitSet(statesLen),
        refine = new OpenBitSet(statesLen),
        refine2 = new OpenBitSet(statesLen);
    for (int q = 0; q < statesLen; q++)
    {
        splitblock[q] = new JCG.List<State>();
        partition[q] = new JCG.HashSet<State>();
        for (int x = 0; x < sigmaLen; x++)
        {
            active[q, x] = new StateList();
        }
    }
    // find initial partition and reverse edges
    for (int q = 0; q < statesLen; q++)
    {
        State qq = states[q];
        int j = qq.accept ? 0 : 1;
        partition[j].Add(qq);
        block[q] = j;
        for (int x = 0; x < sigmaLen; x++)
        {
            //JCG.List<State>[] r = reverse[qq.Step(sigma[x]).number];
            var r = qq.Step(sigma[x]).number;
            if (reverse[r, x] is null)
            {
                reverse[r, x] = new JCG.List<State>();
            }
            reverse[r, x].Add(qq);
        }
    }
    // initialize active sets
    for (int j = 0; j <= 1; j++)
    {
        for (int x = 0; x < sigmaLen; x++)
        {
            foreach (State qq in partition[j])
            {
                if (reverse[qq.number, x] != null)
                {
                    active2[qq.number, x] = active[j, x].Add(qq);
                }
            }
        }
    }
    // initialize pending
    for (int x = 0; x < sigmaLen; x++)
    {
        int j = (active[0, x].Count <= active[1, x].Count) ? 0 : 1;
        pending.Enqueue(new Int32Pair(j, x));
        pending2.Set(x * statesLen + j);
    }
    // process pending until fixed point
    int k = 2;
    while (pending.Count > 0)
    {
        Int32Pair ip = pending.Dequeue();
        int p = ip.n1;
        int x = ip.n2;
        pending2.Clear(x * statesLen + p);
        // find states that need to be split off their blocks
        for (StateListNode m = active[p, x].First; m != null; m = m.Next)
        {
            JCG.List<State> r = reverse[m.Q.number, x];
            if (r != null)
            {
                foreach (State s in r)
                {
                    int i = s.number;
                    if (!split.Get(i))
                    {
                        split.Set(i);
                        int j = block[i];
                        splitblock[j].Add(s);
                        if (!refine2.Get(j))
                        {
                            refine2.Set(j);
                            refine.Set(j);
                        }
                    }
                }
            }
        }
        // refine blocks
        for (int j = refine.NextSetBit(0); j >= 0; j = refine.NextSetBit(j + 1))
        {
            JCG.List<State> sb = splitblock[j];
            if (sb.Count < partition[j].Count)
            {
                ISet<State> b1 = partition[j];
                ISet<State> b2 = partition[k];
                foreach (State s in sb)
                {
                    b1.Remove(s);
                    b2.Add(s);
                    block[s.number] = k;
                    for (int c = 0; c < sigmaLen; c++)
                    {
                        StateListNode sn = active2[s.number, c];
                        if (sn != null && sn.Sl == active[j, c])
                        {
                            sn.Remove();
                            active2[s.number, c] = active[k, c].Add(s);
                        }
                    }
                }
                // update pending
                for (int c = 0; c < sigmaLen; c++)
                {
                    int aj = active[j, c].Count,
                        ak = active[k, c].Count,
                        ofs = c * statesLen;
                    if (!pending2.Get(ofs + j) && 0 < aj && aj <= ak)
                    {
                        pending2.Set(ofs + j);
                        pending.Enqueue(new Int32Pair(j, c));
                    }
                    else
                    {
                        pending2.Set(ofs + k);
                        pending.Enqueue(new Int32Pair(k, c));
                    }
                }
                k++;
            }
            refine2.Clear(j);
            foreach (State s in sb)
            {
                split.Clear(s.number);
            }
            sb.Clear();
        }
        refine.Clear(0, refine.Length);
    }
    // make a new state for each equivalence class, set initial state
    State[] newstates = new State[k];
    for (int n = 0; n < newstates.Length; n++)
    {
        State s = new State();
        newstates[n] = s;
        foreach (State q in partition[n])
        {
            if (q == a.initial)
            {
                a.initial = s;
            }
            s.accept = q.accept;
            s.number = q.number; // select representative
            q.number = n;
        }
    }
    // build transitions and set acceptance
    for (int n = 0; n < newstates.Length; n++)
    {
        State s = newstates[n];
        s.accept = states[s.number].accept;
        foreach (Transition t in states[s.number].GetTransitions())
        {
            s.AddTransition(new Transition(t.min, t.max, newstates[t.to.number]));
        }
    }
    a.ClearNumberedStates();
    a.RemoveDeadTransitions();
}
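And a hedged driver for the minimizer above, again assuming Lucene.NET's automaton helpers (BasicAutomata, BasicOperations): the union of two literal-string automata is determinized and collapsed to the smallest equivalent DFA.

using System;
using Lucene.Net.Util.Automaton;

// Union of two literal-string automata; MinimizeHopcroft determinizes first,
// then merges equivalent states.
var a = BasicOperations.Union(new[]
{
    BasicAutomata.MakeString("foo"),
    BasicAutomata.MakeString("foobar")
});
MinimizationOperations.MinimizeHopcroft(a);
Console.WriteLine(a.GetNumberOfStates());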
public BitSetRandomAccessDocIdSet(bool multi, MultiValueFacetDataCache multiCache, OpenBitSet openBitSet, FacetDataCache dataCache)
{
    m_multi = multi;
    m_multiCache = multiCache;
    m_openBitSet = openBitSet;
    m_dataCache = dataCache;
}
public virtual void UpdateParams(OpenBitSet @set)
{
    _b = GetBitSlice(@set, 0, BYTE_MASK);
    _exceptionOffset = HEADER_MASK + _b * _batchSize;
}
public override void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxdoc = reader.MaxDoc;
    BigNestedInt32Array.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);
    BigNestedInt32Array.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

    var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, field);
    int t = 1; // valid term id starts from 1

    // Reserve index 0 for the "no value" term.
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);

    m_overflow = false;

    string pre = null;
    int df = 0;
    int minID = -1;
    int maxID = -1;
    int docID = -1;
    int valId = 0;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            string strText = text.Utf8ToString();

            // Terms are stored as "value\0weight"; skip any term without a weight suffix.
            string val = null;
            int weight = 0;
            string[] split = strText.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
            if (split.Length > 1)
            {
                val = split[0];
                weight = int.Parse(split[split.Length - 1]);
            }
            else
            {
                continue;
            }

            if (pre == null || !val.Equals(pre))
            {
                // New value: flush the stats accumulated for the previous one.
                if (pre != null)
                {
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }
                list.Add(val);
                df = 0;
                minID = -1;
                maxID = -1;
                valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                t++;
            }

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);
            if (docsEnum != null)
            {
                while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                {
                    df++;
                    if (!loader.Add(docID, valId)) LogOverflow(fieldName);
                    else weightLoader.Add(docID, weight);
                    // minID starts at -1, so "docID < minID" alone could never fire;
                    // also accept the first doc seen for this value.
                    if (minID == -1 || docID < minID) minID = docID;
                    bitset.FastSet(docID);
                    while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                    {
                        docID = docsEnum.DocID;
                        df++;
                        if (!loader.Add(docID, valId)) LogOverflow(fieldName);
                        else weightLoader.Add(docID, weight);
                        bitset.FastSet(docID);
                    }
                    if (docID > maxID) maxID = docID;
                }
            }
            pre = val;
        }
        if (pre != null)
        {
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
        }
    }

    list.Seal();

    try
    {
        m_nestedArray.Load(maxdoc + 1, loader);
        m_weightArray.Load(maxdoc + 1, weightLoader);
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    // Compute the doc range and frequency of the "no value" term (index 0).
    int doc = 0;
    while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc < maxdoc)
    {
        this.m_minIDs[0] = doc;
        doc = maxdoc - 1;
        while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
}
public BitSetCollector(OpenBitSet bitSet) { _bitSet = bitSet; }
public bool IsSubsetOf(BitPointSet other)
{
    // This set is a subset of "other" iff every set bit is also set in "other",
    // i.e. the intersection count equals this set's own cardinality.
    return _cardinality == OpenBitSet.IntersectionCount(_points, other._points);
}
public BitPointSet(int maxPoints) { _points = new OpenBitSet(maxPoints); }
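The subset test above relies on a standard bit-set identity: A is a subset of B exactly when |A ∩ B| = |A|. A small stand-alone illustration of the same check with OpenBitSet directly; the names and bit positions here are illustrative only:

using System;
using Lucene.Net.Util;

var small = new OpenBitSet(32);
var big = new OpenBitSet(32);
small.Set(2); small.Set(7);
big.Set(2); big.Set(7); big.Set(19);

// |small ∩ big| == |small|  =>  small is a subset of big
bool subset = small.Cardinality() == OpenBitSet.IntersectionCount(small, big);
Console.WriteLine(subset); // true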
/// <summary>Method to decompress the entire batch.</summary>
/// <param name="blob">OpenBitSet containing the compressed batch</param>
/// <returns>int array with the decompressed segment of numbers</returns>
protected internal virtual int[] Decompress(OpenBitSet blob)
{
    return new P4DSetNoBase().Decompress(blob);
}
public ThreadClassAnonymousHelper(TestTimeLimitingCollector outerInstance, OpenBitSet success, bool withTimeout, int num)
{
    this.outerInstance = outerInstance;
    this.success = success;
    this.withTimeout = withTimeout;
    this.num = num;
}
internal AssertingPostingsConsumer(PostingsConsumer @in, FieldInfo fieldInfo, OpenBitSet visitedDocs)
{
    this.@in = @in;
    this.fieldInfo = fieldInfo;
    this.VisitedDocs = visitedDocs;
}
/// <summary>
/// Loads multi-value facet data. This method uses a workarea to prepare loading.
/// </summary>
/// <param name="fieldName"></param>
/// <param name="reader"></param>
/// <param name="listFactory"></param>
/// <param name="workArea"></param>
public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
{
#if FEATURE_STRING_INTERN
    string field = string.Intern(fieldName);
#else
    string field = fieldName;
#endif
    int maxdoc = reader.MaxDoc;
    BigNestedInt32Array.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

    ITermValueList list = (listFactory == null ? (ITermValueList)new TermStringList() : listFactory.CreateTermList());
    List<int> minIDList = new List<int>();
    List<int> maxIDList = new List<int>();
    List<int> freqList = new List<int>();

    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = GetNegativeValueCount(reader, field);
    int t = 1; // valid term id starts from 1

    // Reserve index 0 for the "no value" term.
    list.Add(null);
    minIDList.Add(-1);
    maxIDList.Add(-1);
    freqList.Add(0);

    m_overflow = false;

    Terms terms = reader.GetTerms(field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetIterator(null);
        BytesRef text;
        while ((text = termsEnum.Next()) != null)
        {
            string strText = text.Utf8ToString();
            list.Add(strText);

            Term term = new Term(field, strText);
            DocsEnum docsEnum = reader.GetTermDocsEnum(term);

            int df = 0;
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
            {
                df++;
                if (!loader.Add(docID, valId)) LogOverflow(fieldName);
                minID = docID;
                bitset.FastSet(docID);
                while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                {
                    docID = docsEnum.DocID;
                    df++;
                    if (!loader.Add(docID, valId)) LogOverflow(fieldName);
                    bitset.FastSet(docID);
                }
                maxID = docID;
            }
            freqList.Add(df);
            minIDList.Add(minID);
            maxIDList.Add(maxID);
            t++;
        }
    }

    list.Seal();

    try
    {
        m_nestedArray.Load(maxdoc + 1, loader);
    }
    catch (Exception e)
    {
        throw new RuntimeException("failed to load due to " + e.ToString(), e);
    }

    this.m_valArray = list;
    this.m_freqs = freqList.ToArray();
    this.m_minIDs = minIDList.ToArray();
    this.m_maxIDs = maxIDList.ToArray();

    // Compute the doc range and frequency of the "no value" term (index 0).
    int doc = 0;
    while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
    {
        ++doc;
    }
    if (doc < maxdoc)
    {
        this.m_minIDs[0] = doc;
        doc = maxdoc - 1;
        while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
        {
            --doc;
        }
        this.m_maxIDs[0] = doc;
    }
    this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
}
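In each of the loaders above, the bitset marks every doc that received at least one facet value, so the frequency of the reserved "no value" term at index 0 falls out as maxdoc minus the bitset's cardinality. A toy illustration of that bookkeeping; the doc ids are invented:

using System;
using Lucene.Net.Util;

int maxdoc = 10;
var seen = new OpenBitSet(maxdoc);
// Pretend docs 0, 3 and 7 each matched some facet value.
seen.FastSet(0); seen.FastSet(3); seen.FastSet(7);

// Docs with no value at all: everything the loader never touched.
int noValueCount = maxdoc - (int)seen.Cardinality();
Console.WriteLine(noValueCount); // 7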
public SearchBits(OpenBitSet openBitSet) { _openBitSet = openBitSet; }
public override DocIdSet GetDocIdSet(Index.IndexReader reader /*, Bits acceptDocs*/, IState state)
{
    var bits = new OpenBitSet(reader.MaxDoc);
    var terms = new TermsEnumCompatibility(reader, fieldName, state);
    var term = terms.Next(state);
    if (term == null)
    {
        return null;
    }

    Node scanCell = null;

    // cells is treated like a stack. LinkedList conveniently has bulk add to beginning.
    // It's in sorted order so that we always advance forward through the termsEnum index.
    var cells = new LinkedList<Node>(grid.GetWorldNode().GetSubCells(queryShape));

    // This is a recursive algorithm that starts with one or more "big" cells, and then recursively
    // dives down into the first such cell that intersects with the query shape. It's a depth first
    // traversal because we don't move onto the next big cell (breadth) until we're completely done
    // considering all smaller cells beneath it. For a given cell, if it's *within* the query shape
    // then we can conveniently short-circuit the depth traversal and grab all documents assigned to
    // this cell/term. For an intersection of the cell and query shape, we either recursively step
    // down another grid level or we decide heuristically (via prefixGridScanLevel) that there aren't
    // that many points, and so we scan through all terms within this cell (i.e. the term starts with
    // the cell's term), seeing which ones are within the query shape.
    while (cells.Count > 0)
    {
        Node cell = cells.First.Value;
        cells.RemoveFirst();
        var cellTerm = cell.GetTokenString();
        var seekStat = terms.Seek(cellTerm, state);
        if (seekStat == TermsEnumCompatibility.SeekStatus.END)
        {
            break;
        }
        if (seekStat == TermsEnumCompatibility.SeekStatus.NOT_FOUND)
        {
            continue;
        }

        if (cell.GetLevel() == detailLevel || cell.IsLeaf())
        {
            terms.Docs(bits, state);
        }
        else
        {
            // any other intersection
            // If the next indexed term is the leaf marker, then add all of them
            var nextCellTerm = terms.Next(state);
            Debug.Assert(nextCellTerm.Text.StartsWith(cellTerm));
            scanCell = grid.GetNode(nextCellTerm.Text, scanCell);
            if (scanCell.IsLeaf())
            {
                terms.Docs(bits, state);
                term = terms.Next(state); // move pointer to avoid potential redundant addDocs() below
            }

            // Decide whether to continue to divide & conquer, or whether it's time to scan through
            // terms beneath this cell. Scanning is a performance optimization trade-off.
            bool scan = cell.GetLevel() >= prefixGridScanLevel; // simple heuristic
            if (!scan)
            {
                // Divide & conquer
                var lst = cell.GetSubCells(queryShape);
                for (var i = lst.Count - 1; i >= 0; i--) // add to beginning
                {
                    cells.AddFirst(lst[i]);
                }
            }
            else
            {
                // Scan through all terms within this cell to see if they are within the queryShape. No seek()s.
                for (var t = terms.Term(); t != null && t.Text.StartsWith(cellTerm); t = terms.Next(state))
                {
                    scanCell = grid.GetNode(t.Text, scanCell);
                    int termLevel = scanCell.GetLevel();
                    if (termLevel > detailLevel)
                    {
                        continue;
                    }
                    if (termLevel == detailLevel || scanCell.IsLeaf())
                    {
                        Shape cShape;
                        if (termLevel == grid.GetMaxLevels() && queryShape.HasArea())
                        {
                            // TODO should put more thought into implications of box vs point
                            cShape = scanCell.GetCenter();
                        }
                        else
                        {
                            cShape = scanCell.GetShape();
                        }
                        if (queryShape.Relate(cShape) == SpatialRelation.DISJOINT)
                        {
                            continue;
                        }
                        terms.Docs(bits, state);
                    }
                } // term loop
            }
        }
    } // cell loop

    return bits;
}
public int FindValues(OpenBitSet values, int id, int maxID)
{
    return FindValues(values, id, maxID, false);
}
public OBSDocIdSet(int length) { bitSet = new OpenBitSet(length); }
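OpenBitSet itself extends DocIdSet, so a wrapper like OBSDocIdSet can hand the bits straight to Lucene's iteration machinery. A sketch of walking the set bits as doc ids, assuming Lucene.NET's OpenBitSet.GetIterator() and DocIdSetIterator API:

using System;
using Lucene.Net.Search;
using Lucene.Net.Util;

var bitSet = new OpenBitSet(100);
bitSet.FastSet(4); bitSet.FastSet(42);

// GetIterator() visits the set bits in increasing doc-id order.
DocIdSetIterator it = bitSet.GetIterator();
int doc;
while ((doc = it.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
{
    Console.WriteLine(doc); // 4, then 42
}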
public SpecialsComparator(OpenBitSet docValues) { _docValues = docValues; }
public abstract int FindValues(OpenBitSet bitset, int id, int maxId);
/// <summary>Internal decompression method (stub: not yet implemented).</summary>
private int[] decompress(OpenBitSet packedSet)
{
    Console.Error.WriteLine("Method not implemented");
    return null;
}