private IDictionary<int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList<AtomicReaderContext> leaves, int maxPassages, Query query) {
    IDictionary<int, object> highlights = new Dictionary<int, object>();

    PassageFormatter fieldFormatter = GetFormatter(field);
    if (fieldFormatter == null) {
        throw new NullReferenceException("PassageFormatter cannot be null");
    }

    // check if we should do any multiterm processing
    Analyzer analyzer = GetIndexAnalyzer(field);
    CharacterRunAutomaton[] automata = new CharacterRunAutomaton[0];
    if (analyzer != null) {
        automata = MultiTermHighlighting.ExtractAutomata(query, field);
    }

    // resize 'terms', where the last term is the multiterm matcher
    if (automata.Length > 0) {
        BytesRef[] newTerms = new BytesRef[terms.Length + 1];
        System.Array.Copy(terms, 0, newTerms, 0, terms.Length);
        terms = newTerms;
    }

    // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
    // otherwise, we will just advance() existing enums to the new document in the same segment.
    DocsAndPositionsEnum[] postings = null;
    TermsEnum termsEnum = null;
    int lastLeaf = -1;

    for (int i = 0; i < docids.Length; i++) {
        string content = contents[i];
        if (content.Length == 0) {
            continue; // nothing to do
        }
        bi.SetText(content);
        int doc = docids[i];
        int leaf = ReaderUtil.SubIndex(doc, leaves);
        AtomicReaderContext subContext = leaves[leaf];
        AtomicReader r = subContext.AtomicReader;
        Debug.Assert(leaf >= lastLeaf); // increasing order

        // if the segment has changed, we must initialize new enums.
        if (leaf != lastLeaf) {
            Terms t = r.GetTerms(field);
            if (t != null) {
                termsEnum = t.GetIterator(null);
                postings = new DocsAndPositionsEnum[terms.Length];
            }
        }
        if (termsEnum == null) {
            continue; // no terms for this field, nothing to do
        }

        // if there are multi-term matches, we have to initialize the "fake" enum for each document
        if (automata.Length > 0) {
            DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata);
            dp.Advance(doc - subContext.DocBase);
            postings[terms.Length - 1] = dp; // last term is the multiterm matcher
        }

        Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages);
        if (passages.Length == 0) {
            // no passages were returned, so ask for a default summary
            passages = GetEmptyHighlight(field, bi, maxPassages);
        }
        if (passages.Length > 0) {
            highlights[doc] = fieldFormatter.Format(passages, content);
        }
        lastLeaf = leaf;
    }

    return highlights;
}
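// Hedged sketch (not part of the source above): the per-document loop in HighlightField maps a
// top-level docID to its segment via ReaderUtil.SubIndex and then makes it segment-local by
// subtracting the leaf's DocBase. A minimal helper showing the same mapping in isolation,
// assuming Lucene.NET 4.8; the class and method names here are illustrative only.
using System.Collections.Generic;
using Lucene.Net.Index;

internal static class DocIdMappingSketch {
    // Returns the leaf index and the segment-local docID for a top-level docID.
    public static (int Leaf, int LocalDoc) ToSegmentLocal(IndexReader reader, int globalDoc) {
        IList<AtomicReaderContext> leaves = reader.Leaves;       // one context per segment
        int leaf = ReaderUtil.SubIndex(globalDoc, leaves);       // find the segment containing globalDoc
        return (leaf, globalDoc - leaves[leaf].DocBase);         // subtract the leaf's doc base
    }
}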
/// <summary>
/// Create the results based on the search hits.
/// Can be overridden by subclass to add particular behavior (e.g. weight transformation)
/// </summary>
/// <exception cref="IOException"> If there are problems reading fields from the underlying Lucene index. </exception>
protected internal virtual IList<LookupResult> CreateResults(IndexSearcher searcher, TopFieldDocs hits, int num, string charSequence, bool doHighlight, ICollection<string> matchedTokens, string prefixToken) {
    BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);

    // This will just be null if app didn't pass payloads to build():
    // TODO: maybe just stored fields? they compress...
    BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");
    IList<AtomicReaderContext> leaves = searcher.IndexReader.Leaves;
    IList<LookupResult> results = new JCG.List<LookupResult>();
    BytesRef scratch = new BytesRef();

    for (int i = 0; i < hits.ScoreDocs.Length; i++) {
        FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];
        textDV.Get(fd.Doc, scratch);
        string text = scratch.Utf8ToString();
        long score = (J2N.Numerics.Int64)fd.Fields[0];

        BytesRef payload;
        if (payloadsDV != null) {
            payload = new BytesRef();
            payloadsDV.Get(fd.Doc, payload);
        } else {
            payload = null;
        }

        // Must look up sorted-set by segment:
        int segment = ReaderUtil.SubIndex(fd.Doc, leaves);
        SortedSetDocValues contextsDV = leaves[segment].AtomicReader.GetSortedSetDocValues(CONTEXTS_FIELD_NAME);
        ISet<BytesRef> contexts;
        if (contextsDV != null) {
            contexts = new JCG.HashSet<BytesRef>();
            contextsDV.SetDocument(fd.Doc - leaves[segment].DocBase);
            long ord;
            while ((ord = contextsDV.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                BytesRef context = new BytesRef();
                contextsDV.LookupOrd(ord, context);
                contexts.Add(context);
            }
        } else {
            contexts = null;
        }

        LookupResult result;
        if (doHighlight) {
            object highlightKey = Highlight(text, matchedTokens, prefixToken);
            result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload, contexts);
        } else {
            result = new LookupResult(text, score, payload, contexts);
        }

        results.Add(result);
    }

    return results;
}
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) {
    if (Debugging.AssertsEnabled) {
        Debugging.Assert(termStates.TopReaderContext == ReaderUtil.GetTopLevelContext(context),
            "The top-reader used to create Weight ({0}) is not the same as the current reader's top-reader ({1})",
            termStates.TopReaderContext, ReaderUtil.GetTopLevelContext(context));
    }

    TermsEnum termsEnum = GetTermsEnum(context);
    if (termsEnum == null) {
        return null;
    }

    DocsEnum docs = termsEnum.Docs(acceptDocs, null);
    if (Debugging.AssertsEnabled) {
        Debugging.Assert(docs != null);
    }

    return new TermScorer(this, docs, similarity.GetSimScorer(stats, context));
}
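// Hedged sketch (illustrative only): the assertion in GetScorer above checks that the cached
// term state was built against the same top-level reader as the leaf being scored. The same
// check, factored out; assumes Lucene.NET 4.8, and the class/method names are hypothetical.
using Lucene.Net.Index;

internal static class ReaderContextSketch {
    public static bool SharesTopLevelReader(AtomicReaderContext leaf, IndexReaderContext cachedTopLevel) {
        // GetTopLevelContext walks Parent links up to the composite root context.
        return ReaderUtil.GetTopLevelContext(leaf) == cachedTopLevel;
    }
}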
internal virtual void TestSort(bool useFrom, bool VERBOSE) {
    IndexReader reader = null;
    Directory dir = null;

    if (!VERBOSE) {
        Console.WriteLine("Verbosity disabled. Enable manually if needed.");
    }

    int numDocs = VERBOSE ? AtLeast(50) : AtLeast(1000);
    //final int numDocs = AtLeast(50);

    string[] tokens = new string[] { "a", "b", "c", "d", "e" };

    if (VERBOSE) {
        Console.WriteLine("TEST: make index");
    }

    {
        dir = NewDirectory();
        RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
            this,
#endif
            Random, dir);
        // w.setDoRandomForceMerge(false);
        // w.w.getConfig().SetMaxBufferedDocs(AtLeast(100));
        string[] content = new string[AtLeast(20)];
        for (int contentIDX = 0; contentIDX < content.Length; contentIDX++) {
            StringBuilder sb = new StringBuilder();
            int numTokens = TestUtil.NextInt32(Random, 1, 10);
            for (int tokenIDX = 0; tokenIDX < numTokens; tokenIDX++) {
                sb.Append(tokens[Random.Next(tokens.Length)]).Append(' ');
            }
            content[contentIDX] = sb.ToString();
        }

        for (int docIDX = 0; docIDX < numDocs; docIDX++) {
            Document doc = new Document();
            doc.Add(NewStringField("string", TestUtil.RandomRealisticUnicodeString(Random), Field.Store.NO));
            doc.Add(NewTextField("text", content[Random.Next(content.Length)], Field.Store.NO));
            doc.Add(new SingleField("float", (float)Random.NextDouble(), Field.Store.NO));
            int intValue;
            if (Random.Next(100) == 17) {
                intValue = int.MinValue;
            } else if (Random.Next(100) == 17) {
                intValue = int.MaxValue;
            } else {
                intValue = Random.Next();
            }
            doc.Add(new Int32Field("int", intValue, Field.Store.NO));
            if (VERBOSE) {
                Console.WriteLine(" doc=" + doc);
            }
            w.AddDocument(doc);
        }

        reader = w.GetReader();
        w.Dispose();
    }

    // NOTE: sometimes reader has just one segment, which is
    // important to test
    IndexSearcher searcher = NewSearcher(reader);
    IndexReaderContext ctx = searcher.TopReaderContext;

    ShardSearcher[] subSearchers;
    int[] docStarts;

    if (ctx is AtomicReaderContext) {
        subSearchers = new ShardSearcher[1];
        docStarts = new int[1];
        subSearchers[0] = new ShardSearcher((AtomicReaderContext)ctx, ctx);
        docStarts[0] = 0;
    } else {
        CompositeReaderContext compCTX = (CompositeReaderContext)ctx;
        int size = compCTX.Leaves.Count;
        subSearchers = new ShardSearcher[size];
        docStarts = new int[size];
        int docBase = 0;
        for (int searcherIDX = 0; searcherIDX < subSearchers.Length; searcherIDX++) {
            AtomicReaderContext leave = compCTX.Leaves[searcherIDX];
            subSearchers[searcherIDX] = new ShardSearcher(leave, compCTX);
            docStarts[searcherIDX] = docBase;
            docBase += leave.Reader.MaxDoc;
        }
    }

    IList<SortField> sortFields = new List<SortField>();
    sortFields.Add(new SortField("string", SortFieldType.STRING, true));
    sortFields.Add(new SortField("string", SortFieldType.STRING, false));
    sortFields.Add(new SortField("int", SortFieldType.INT32, true));
    sortFields.Add(new SortField("int", SortFieldType.INT32, false));
    sortFields.Add(new SortField("float", SortFieldType.SINGLE, true));
    sortFields.Add(new SortField("float", SortFieldType.SINGLE, false));
    sortFields.Add(new SortField(null, SortFieldType.SCORE, true));
    sortFields.Add(new SortField(null, SortFieldType.SCORE, false));
    sortFields.Add(new SortField(null, SortFieldType.DOC, true));
    sortFields.Add(new SortField(null, SortFieldType.DOC, false));

    for (int iter = 0; iter < 1000 * RandomMultiplier; iter++) {
        // TODO: custom FieldComp...
        Query query = new TermQuery(new Term("text", tokens[Random.Next(tokens.Length)]));

        Sort sort;
        if (Random.Next(10) == 4) {
            // Sort by score
            sort = null;
        } else {
            SortField[] randomSortFields = new SortField[TestUtil.NextInt32(Random, 1, 3)];
            for (int sortIDX = 0; sortIDX < randomSortFields.Length; sortIDX++) {
                randomSortFields[sortIDX] = sortFields[Random.Next(sortFields.Count)];
            }
            sort = new Sort(randomSortFields);
        }

        int numHits = TestUtil.NextInt32(Random, 1, numDocs + 5);
        //final int numHits = 5;

        if (VERBOSE) {
            Console.WriteLine("TEST: search query=" + query + " sort=" + sort + " numHits=" + numHits);
        }

        int from = -1;
        int size = -1;

        // First search on whole index:
        TopDocs topHits;
        if (sort == null) {
            if (useFrom) {
                TopScoreDocCollector c = TopScoreDocCollector.Create(numHits, Random.NextBoolean());
                searcher.Search(query, c);
                from = TestUtil.NextInt32(Random, 0, numHits - 1);
                size = numHits - from;
                TopDocs tempTopHits = c.GetTopDocs();
                if (from < tempTopHits.ScoreDocs.Length) {
                    // Can't use TopDocs#topDocs(start, howMany), since it has different behaviour when start >= hitCount
                    // than TopDocs#merge currently has
                    ScoreDoc[] newScoreDocs = new ScoreDoc[Math.Min(size, tempTopHits.ScoreDocs.Length - from)];
                    Array.Copy(tempTopHits.ScoreDocs, from, newScoreDocs, 0, newScoreDocs.Length);
                    tempTopHits.ScoreDocs = newScoreDocs;
                    topHits = tempTopHits;
                } else {
                    topHits = new TopDocs(tempTopHits.TotalHits, new ScoreDoc[0], tempTopHits.MaxScore);
                }
            } else {
                topHits = searcher.Search(query, numHits);
            }
        } else {
            TopFieldCollector c = TopFieldCollector.Create(sort, numHits, true, true, true, Random.NextBoolean());
            searcher.Search(query, c);
            if (useFrom) {
                from = TestUtil.NextInt32(Random, 0, numHits - 1);
                size = numHits - from;
                TopDocs tempTopHits = c.GetTopDocs();
                if (from < tempTopHits.ScoreDocs.Length) {
                    // Can't use TopDocs#topDocs(start, howMany), since it has different behaviour when start >= hitCount
                    // than TopDocs#merge currently has
                    ScoreDoc[] newScoreDocs = new ScoreDoc[Math.Min(size, tempTopHits.ScoreDocs.Length - from)];
                    Array.Copy(tempTopHits.ScoreDocs, from, newScoreDocs, 0, newScoreDocs.Length);
                    tempTopHits.ScoreDocs = newScoreDocs;
                    topHits = tempTopHits;
                } else {
                    topHits = new TopDocs(tempTopHits.TotalHits, new ScoreDoc[0], tempTopHits.MaxScore);
                }
            } else {
                topHits = c.GetTopDocs(0, numHits);
            }
        }

        if (VERBOSE) {
            if (useFrom) {
                Console.WriteLine("from=" + from + " size=" + size);
            }
            Console.WriteLine(" top search: " + topHits.TotalHits + " totalHits; hits=" + (topHits.ScoreDocs == null ? "null" : topHits.ScoreDocs.Length + " maxScore=" + topHits.MaxScore));
            if (topHits.ScoreDocs != null) {
                for (int hitIDX = 0; hitIDX < topHits.ScoreDocs.Length; hitIDX++) {
                    ScoreDoc sd = topHits.ScoreDocs[hitIDX];
                    Console.WriteLine(" doc=" + sd.Doc + " score=" + sd.Score);
                }
            }
        }

        // ... then all shards:
        Weight w = searcher.CreateNormalizedWeight(query);

        TopDocs[] shardHits = new TopDocs[subSearchers.Length];
        for (int shardIDX = 0; shardIDX < subSearchers.Length; shardIDX++) {
            TopDocs subHits;
            ShardSearcher subSearcher = subSearchers[shardIDX];
            if (sort == null) {
                subHits = subSearcher.Search(w, numHits);
            } else {
                TopFieldCollector c = TopFieldCollector.Create(sort, numHits, true, true, true, Random.NextBoolean());
                subSearcher.Search(w, c);
                subHits = c.GetTopDocs(0, numHits);
            }

            shardHits[shardIDX] = subHits;
            if (VERBOSE) {
                Console.WriteLine(" shard=" + shardIDX + " " + subHits.TotalHits + " totalHits hits=" + (subHits.ScoreDocs == null ? "null" : subHits.ScoreDocs.Length.ToString()));
                if (subHits.ScoreDocs != null) {
                    foreach (ScoreDoc sd in subHits.ScoreDocs) {
                        Console.WriteLine(" doc=" + sd.Doc + " score=" + sd.Score);
                    }
                }
            }
        }

        // Merge:
        TopDocs mergedHits;
        if (useFrom) {
            mergedHits = TopDocs.Merge(sort, from, size, shardHits);
        } else {
            mergedHits = TopDocs.Merge(sort, numHits, shardHits);
        }

        if (mergedHits.ScoreDocs != null) {
            // Make sure the returned shards are correct:
            for (int hitIDX = 0; hitIDX < mergedHits.ScoreDocs.Length; hitIDX++) {
                ScoreDoc sd = mergedHits.ScoreDocs[hitIDX];
                Assert.AreEqual(ReaderUtil.SubIndex(sd.Doc, docStarts), sd.ShardIndex, "doc=" + sd.Doc + " wrong shard");
            }
        }

        TestUtil.AssertEquals(topHits, mergedHits);
    }

    reader.Dispose();
    dir.Dispose();
}
public override Scorer Scorer(AtomicReaderContext context, Bits acceptDocs) {
    Debug.Assert(TermStates.TopReaderContext == ReaderUtil.GetTopLevelContext(context),
        "The top-reader used to create Weight (" + TermStates.TopReaderContext +
        ") is not the same as the current reader's top-reader (" + ReaderUtil.GetTopLevelContext(context));

    TermsEnum termsEnum = GetTermsEnum(context);
    if (termsEnum == null) {
        return null;
    }

    DocsEnum docs = termsEnum.Docs(acceptDocs, null);
    Debug.Assert(docs != null);

    return new TermScorer(this, docs, Similarity.DoSimScorer(Stats, context));
}
public override FunctionValues GetValues(IDictionary context, AtomicReaderContext readerContext) {
    // Searcher has no numdocs so we must use the reader instead
    return new ConstInt32DocValues(ReaderUtil.GetTopLevelContext(readerContext).Reader.NumDocs, this);
}
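// Hedged sketch: the override above appears to match Lucene.NET's NumDocsValueSource, which
// exposes IndexReader.NumDocs as a constant per-document value. Assuming that context, it can
// be driven through a FunctionQuery; the class and method names below are illustrative only.
using Lucene.Net.Queries.Function;
using Lucene.Net.Queries.Function.ValueSources;
using Lucene.Net.Search;

internal static class NumDocsQuerySketch {
    public static TopDocs Run(IndexSearcher searcher) {
        var query = new FunctionQuery(new NumDocsValueSource());   // every hit scores as NumDocs
        return searcher.Search(query, 10);
    }
}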
/// <summary>
/// Does all the "real work" of tallying up the counts.
/// </summary>
private void Count(IList<FacetsCollector.MatchingDocs> matchingDocs) {
    //System.out.println("ssdv count");

    MultiDocValues.OrdinalMap ordinalMap;

    // TODO: is this right? really, we need a way to
    // verify that this ordinalMap "matches" the leaves in
    // matchingDocs...
    if (dv is MultiDocValues.MultiSortedSetDocValues && matchingDocs.Count > 1) {
        ordinalMap = ((MultiDocValues.MultiSortedSetDocValues)dv).Mapping;
    } else {
        ordinalMap = null;
    }

    IndexReader origReader = state.OrigReader;

    foreach (FacetsCollector.MatchingDocs hits in matchingDocs) {
        var reader = hits.Context.AtomicReader;
        //System.out.println(" reader=" + reader);

        // LUCENE-5090: make sure the provided reader context "matches"
        // the top-level reader passed to the
        // SortedSetDocValuesReaderState, else cryptic
        // AIOOBE can happen:
        if (!Equals(ReaderUtil.GetTopLevelContext(hits.Context).Reader, origReader)) {
            throw new InvalidOperationException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
        }

        SortedSetDocValues segValues = reader.GetSortedSetDocValues(field);
        if (segValues == null) {
            continue;
        }

        DocIdSetIterator docs = hits.Bits.GetIterator();

        // TODO: yet another option is to count all segs
        // first, only in seg-ord space, and then do a
        // merge-sort-PQ in the end to only "resolve to
        // global" those seg ords that can compete, if we know
        // we just want top K? ie, this is the same algo
        // that'd be used for merging facets across shards
        // (distributed faceting). but this has much higher
        // temp ram req'ts (sum of number of ords across all
        // segs)
        if (ordinalMap != null) {
            int segOrd = hits.Context.Ord;
            int numSegOrds = (int)segValues.ValueCount;

            if (hits.TotalHits < numSegOrds / 10) {
                //System.out.println(" remap as-we-go");
                // Remap every ord to global ord as we iterate:
                int doc;
                while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    //System.out.println(" doc=" + doc);
                    segValues.SetDocument(doc);
                    int term = (int)segValues.NextOrd();
                    while (term != SortedSetDocValues.NO_MORE_ORDS) {
                        //System.out.println(" segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term));
                        counts[(int)ordinalMap.GetGlobalOrd(segOrd, term)]++;
                        term = (int)segValues.NextOrd();
                    }
                }
            } else {
                //System.out.println(" count in seg ord first");

                // First count in seg-ord space:
                int[] segCounts = new int[numSegOrds];
                int doc;
                while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    //System.out.println(" doc=" + doc);
                    segValues.SetDocument(doc);
                    int term = (int)segValues.NextOrd();
                    while (term != SortedSetDocValues.NO_MORE_ORDS) {
                        //System.out.println(" ord=" + term);
                        segCounts[term]++;
                        term = (int)segValues.NextOrd();
                    }
                }

                // Then, migrate to global ords:
                for (int ord = 0; ord < numSegOrds; ord++) {
                    int count = segCounts[ord];
                    if (count != 0) {
                        //System.out.println(" migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord));
                        counts[(int)ordinalMap.GetGlobalOrd(segOrd, ord)] += count;
                    }
                }
            }
        } else {
            // No ord mapping (e.g., single segment index):
            // just aggregate directly into counts:
            int doc;
            while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                segValues.SetDocument(doc);
                int term = (int)segValues.NextOrd();
                while (term != SortedSetDocValues.NO_MORE_ORDS) {
                    counts[term]++;
                    term = (int)segValues.NextOrd();
                }
            }
        }
    }
}
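// Hedged usage sketch for the counting code above (it appears to come from Lucene.NET 4.8's
// SortedSetDocValuesFacetCounts). Assumes the documents were indexed with
// SortedSetDocValuesFacetField values for an "Author" dimension; the names used here are
// illustrative, not taken from the source above.
using Lucene.Net.Facet;
using Lucene.Net.Facet.SortedSet;
using Lucene.Net.Search;

internal static class SortedSetFacetSketch {
    public static FacetResult CountAuthors(IndexSearcher searcher) {
        var state = new DefaultSortedSetDocValuesReaderState(searcher.IndexReader);
        var fc = new FacetsCollector();
        FacetsCollector.Search(searcher, new MatchAllDocsQuery(), 10, fc);   // gather MatchingDocs
        Facets facets = new SortedSetDocValuesFacetCounts(state, fc);        // runs the Count(...) shown above
        return facets.GetTopChildren(10, "Author");                          // top labels with counts
    }
}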
public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer) {
    //we may be starting the deserialization here, if thats the case we need to resolve this object as the root
    if (DocumentRootConverter.TryResolveAsRootData(reader, objectType, serializer, out object obj)) {
        return obj;
    }

    //read into the 'Data' element
    return ReaderUtil.ReadInto(
        reader as ForkableJsonReader ?? new ForkableJsonReader(reader),
        DataReadPathRegex,
        dataReader => {
            //if they have custom convertors registered, we will respect them
            var customConvertor = serializer.Converters.FirstOrDefault(x => x.CanRead && x.CanConvert(objectType));
            if (customConvertor != null && customConvertor != this) {
                return customConvertor.ReadJson(reader, objectType, existingValue, serializer);
            }

            //if the value has been explicitly set to null then the value of the element is simply null
            if (dataReader.TokenType == JsonToken.Null) {
                return null;
            }

            var serializationData = SerializationData.GetSerializationData(dataReader);

            //if we arent given an existing value check the references to see if we have one in there
            //if we dont have one there then create a new object to populate
            if (existingValue == null) {
                var reference = ReaderUtil.ReadAheadToIdentifyObject(dataReader);

                if (!serializationData.Included.TryGetValue(reference, out existingValue)) {
                    existingValue = CreateObject(objectType, reference.Type, serializer);
                    serializationData.Included.Add(reference, existingValue);
                }

                if (existingValue is JObject existingValueJObject) {
                    //sometimes the value in the reference resolver is a JObject. This occurs when we
                    //did not know what type it should be when we first read it (i.e. included was processed
                    //before the item). In these cases we will create a new object and read data from the JObject
                    dataReader = new ForkableJsonReader(existingValueJObject.CreateReader(), dataReader.SerializationDataToken);
                    dataReader.Read(); //JObject readers begin at Not Started
                    existingValue = CreateObject(objectType, reference.Type, serializer);
                    serializationData.Included[reference] = existingValue;
                }
            }

            //additional check to ensure the object we created is of the correct type
            if (!TypeInfoShim.IsInstanceOf(objectType.GetTypeInfo(), existingValue)) {
                throw new JsonSerializationException($"Unable to assign object '{existingValue}' to type '{objectType}'");
            }

            PopulateProperties(serializer, existingValue, dataReader);
            return existingValue;
        }));
}
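// Hedged usage sketch: in JsonApiSerializer this ReadJson override is normally invoked
// indirectly by registering the library's converters through JsonApiSerializerSettings rather
// than by calling it directly. The Article type and method name below are hypothetical.
using JsonApiSerializer;
using Newtonsoft.Json;

public class Article {
    public string Id { get; set; }
    public string Title { get; set; }
}

public static class JsonApiReadSketch {
    public static Article Deserialize(string jsonApiDocument) {
        var settings = new JsonApiSerializerSettings();   // wires up the resource object converter
        return JsonConvert.DeserializeObject<Article>(jsonApiDocument, settings);
    }
}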