public virtual void Test2()
        {
            Random            random   = Random;
            int               NUM_DOCS = AtLeast(100);
            Directory         dir      = NewDirectory();
            RandomIndexWriter writer   = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                random, dir);
            bool          allowDups = random.NextBoolean();
            ISet <string> seen      = new JCG.HashSet <string>();

            if (Verbose)
            {
                Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS + " allowDups=" + allowDups);
            }
            int numDocs = 0;
            IList <BytesRef> docValues = new JCG.List <BytesRef>();

            // TODO: deletions
            while (numDocs < NUM_DOCS)
            {
                string s;
                if (random.NextBoolean())
                {
                    s = TestUtil.RandomSimpleString(random);
                }
                else
                {
                    s = TestUtil.RandomUnicodeString(random);
                }
                BytesRef br = new BytesRef(s);

                if (!allowDups)
                {
                    if (seen.Contains(s))
                    {
                        continue;
                    }
                    seen.Add(s);
                }

                if (Verbose)
                {
                    Console.WriteLine("  " + numDocs + ": s=" + s);
                }

                Document doc = new Document();
                doc.Add(new SortedDocValuesField("stringdv", br));
                doc.Add(new NumericDocValuesField("id", numDocs));
                docValues.Add(br);
                writer.AddDocument(doc);
                numDocs++;

                if (random.Next(40) == 17)
                {
                    // force flush
                    writer.GetReader().Dispose();
                }
            }

            writer.ForceMerge(1);
            DirectoryReader r = writer.GetReader();

            writer.Dispose();

            AtomicReader sr = GetOnlySegmentReader(r);

            long END_TIME = (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) + (TestNightly ? 30 : 1); // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

            int NUM_THREADS = TestUtil.NextInt32(LuceneTestCase.Random, 1, 10);

            ThreadJob[] threads = new ThreadJob[NUM_THREADS];
            for (int thread = 0; thread < NUM_THREADS; thread++)
            {
                threads[thread] = new ThreadAnonymousClass2(random, docValues, sr, END_TIME);
                threads[thread].Start();
            }

            foreach (ThreadJob thread in threads)
            {
                thread.Join();
            }

            r.Dispose();
            dir.Dispose();
        }
Example 2
        public void Test()
        {
            RandomIndexWriter writer;
            DirectoryReader   indexReader;
            int numParents        = AtLeast(200);
            IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            cfg.SetMergePolicy(NewLogMergePolicy());
            using (writer = new RandomIndexWriter(Random, NewDirectory(), cfg))
            {
                Document parentDoc = new Document();
                NumericDocValuesField parentVal = new NumericDocValuesField("parent_val", 0L);
                parentDoc.Add(parentVal);
                StringField parent = new StringField("parent", "true", Field.Store.YES);
                parentDoc.Add(parent);
                for (int i = 0; i < numParents; ++i)
                {
                    IList <Document> documents = new JCG.List <Document>();
                    int numChildren            = Random.nextInt(10);
                    for (int j = 0; j < numChildren; ++j)
                    {
                        Document childDoc = new Document();
                        childDoc.Add(new NumericDocValuesField("child_val", Random.nextInt(5)));
                        documents.Add(childDoc);
                    }
                    parentVal.SetInt64Value(Random.nextInt(50));
                    documents.Add(parentDoc);
                    writer.AddDocuments(documents);
                }
                writer.ForceMerge(1);
                indexReader = writer.GetReader();
            }

            AtomicReader     reader        = GetOnlySegmentReader(indexReader);
            Filter           parentsFilter = new FixedBitSetCachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("parent", "true"))));
            FixedBitSet      parentBits    = (FixedBitSet)parentsFilter.GetDocIdSet(reader.AtomicContext, null);
            NumericDocValues parentValues  = reader.GetNumericDocValues("parent_val");

            NumericDocValues childValues = reader.GetNumericDocValues("child_val");

            Sort parentSort = new Sort(new SortField("parent_val", SortFieldType.INT64));
            Sort childSort  = new Sort(new SortField("child_val", SortFieldType.INT64));

            Sort   sort   = new Sort(new SortField("custom", new BlockJoinComparerSource(parentsFilter, parentSort, childSort)));
            Sorter sorter = new Sorter(sort);

            Sorter.DocMap docMap = sorter.Sort(reader);
            assertEquals(reader.MaxDoc, docMap.Count);

            int[] children       = new int[1];
            int   numChildren2   = 0;
            int   previousParent = -1;

            for (int i = 0; i < docMap.Count; ++i)
            {
                int oldID = docMap.NewToOld(i);
                if (parentBits.Get(oldID))
                {
                    // check that we have the right children
                    for (int j = 0; j < numChildren2; ++j)
                    {
                        assertEquals(oldID, parentBits.NextSetBit(children[j]));
                    }
                    // check that children are sorted
                    for (int j = 1; j < numChildren2; ++j)
                    {
                        int doc1 = children[j - 1];
                        int doc2 = children[j];
                        if (childValues.Get(doc1) == childValues.Get(doc2))
                        {
                            assertTrue(doc1 < doc2); // sort is stable
                        }
                        else
                        {
                            assertTrue(childValues.Get(doc1) < childValues.Get(doc2));
                        }
                    }
                    // check that parents are sorted
                    if (previousParent != -1)
                    {
                        if (parentValues.Get(previousParent) == parentValues.Get(oldID))
                        {
                            assertTrue(previousParent < oldID);
                        }
                        else
                        {
                            assertTrue(parentValues.Get(previousParent) < parentValues.Get(oldID));
                        }
                    }
                    // reset
                    previousParent = oldID;
                    numChildren2   = 0;
                }
                else
                {
                    children = ArrayUtil.Grow(children, numChildren2 + 1);
                    children[numChildren2++] = oldID;
                }
            }
            indexReader.Dispose();
            writer.IndexWriter.Directory.Dispose();
        }
Example 3
        /// <summary>
        /// Splits a backslash escaped string on the separator.
        /// <para/>
        /// Current backslash escaping supported:
        /// <para/> \n \t \r \b \f are escaped the same as in a .NET string
        /// <para/> Other characters following a backslash are produced verbatim (\c => c)
        /// </summary>
        /// <param name="s">  the string to split </param>
        /// <param name="separator"> the separator to split on </param>
        /// <param name="decode"> decode backslash escaping </param>
        public static IList <string> SplitSmart(string s, string separator, bool decode)
        {
            IList <string> lst = new JCG.List <string>(2);
            StringBuilder  sb = new StringBuilder();
            int            pos = 0, end = s.Length;

            while (pos < end)
            {
                //if (s.StartsWith(separator,pos))
                if (s.Substring(pos).StartsWith(separator, StringComparison.Ordinal))
                {
                    if (sb.Length > 0)
                    {
                        lst.Add(sb.ToString());
                        sb = new StringBuilder();
                    }
                    pos += separator.Length;
                    continue;
                }

                char ch = s[pos++];
                if (ch == '\\')
                {
                    if (!decode)
                    {
                        sb.Append(ch);
                    }
                    if (pos >= end) // ERROR, or let it go?
                    {
                        break;
                    }
                    ch = s[pos++];
                    if (decode)
                    {
                        switch (ch)
                        {
                        case 'n':
                            ch = '\n';
                            break;

                        case 't':
                            ch = '\t';
                            break;

                        case 'r':
                            ch = '\r';
                            break;

                        case 'b':
                            ch = '\b';
                            break;

                        case 'f':
                            ch = '\f';
                            break;
                        }
                    }
                }

                sb.Append(ch);
            }

            if (sb.Length > 0)
            {
                lst.Add(sb.ToString());
            }

            return(lst);
        }
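
A minimal usage sketch of SplitSmart above, assuming it is called from inside the same utility class; the input literals are illustrative only:

        // Split on "," while decoding backslash escapes; the verbatim string keeps
        // the backslashes literal in source.
        IList<string> parts = SplitSmart(@"a,b\,c,d\n", ",", decode: true);
        // parts => ["a", "b,c", "d\n"]: the escaped comma stays inside its token,
        // and "\n" is decoded to a real newline because decode is true.
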
Example 4
        private void QueryToSpanQuery(Query query, ICollection<byte[]> payloads)
        {
            if (query is BooleanQuery booleanQuery)
            {
                BooleanClause[] queryClauses = booleanQuery.GetClauses();

                for (int i = 0; i < queryClauses.Length; i++)
                {
                    if (!queryClauses[i].IsProhibited)
                    {
                        QueryToSpanQuery(queryClauses[i].Query, payloads);
                    }
                }
            }
            else if (query is PhraseQuery phraseQuery)
            {
                Term[] phraseQueryTerms = phraseQuery.GetTerms();
                SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
                for (int i = 0; i < phraseQueryTerms.Length; i++)
                {
                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
                }

                int slop = phraseQuery.Slop;
                bool inorder = false;

                if (slop == 0)
                {
                    inorder = true;
                }

                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder) { Boost = query.Boost };
                GetPayloads(payloads, sp);
            }
            else if (query is TermQuery termQuery)
            {
                SpanTermQuery stq = new SpanTermQuery(termQuery.Term) { Boost = query.Boost };
                GetPayloads(payloads, stq);
            }
            else if (query is SpanQuery spanQuery)
            {
                GetPayloads(payloads, spanQuery);
            }
            else if (query is FilteredQuery filteredQuery)
            {
                QueryToSpanQuery(filteredQuery.Query, payloads);
            }
            else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
            {
                foreach (var q in disjunctionMaxQuery)
                {
                    QueryToSpanQuery(q, payloads);
                }
            }
            else if (query is MultiPhraseQuery mpq)
            {
                IList<Term[]> termArrays = mpq.GetTermArrays();
                int[] positions = mpq.GetPositions();
                if (positions.Length > 0)
                {
                    int maxPosition = positions[positions.Length - 1];
                    for (int i = 0; i < positions.Length - 1; ++i)
                    {
                        if (positions[i] > maxPosition)
                        {
                            maxPosition = positions[i];
                        }
                    }

                    // LUCENENET: Changed from Query to SpanQuery to eliminate the O(n) cast
                    // required to instantiate SpanOrQuery below
                    IList<SpanQuery>[] disjunctLists = new JCG.List<SpanQuery>[maxPosition + 1];
                    int distinctPositions = 0;

                    for (int i = 0; i < termArrays.Count; ++i)
                    {
                        Term[] termArray = termArrays[i];
                        IList<SpanQuery> disjuncts = disjunctLists[positions[i]]; // LUCENENET: Changed from Query to SpanQuery
                        if (disjuncts is null)
                        {
                            disjuncts = (disjunctLists[positions[i]] = new JCG.List<SpanQuery>(termArray.Length)); // LUCENENET: Changed from Query to SpanQuery
                            ++distinctPositions;
                        }
                        foreach (Term term in termArray)
                        {
                            disjuncts.Add(new SpanTermQuery(term));
                        }
                    }

                    int positionGaps = 0;
                    int position = 0;
                    SpanQuery[] clauses = new SpanQuery[distinctPositions];
                    for (int i = 0; i < disjunctLists.Length; ++i)
                    {
                        IList<SpanQuery> disjuncts = disjunctLists[i]; // LUCENENET: Changed from Query to SpanQuery
                        if (disjuncts != null)
                        {
                            clauses[position++] = new SpanOrQuery(disjuncts);
                        }
                        else
                        {
                            ++positionGaps;
                        }
                    }

                    int slop = mpq.Slop;
                    bool inorder = (slop == 0);

                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
                    sp.Boost = query.Boost;
                    GetPayloads(payloads, sp);
                }
            }
        }
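
This helper is private; a hedged sketch of reaching it through a public entry point, assuming the containing class is PayloadSpanUtil and exposes GetPayloadsForQuery (both names are assumptions here):

        // indexReader: an already-open IndexReader over the target index (assumed).
        var util = new PayloadSpanUtil(indexReader.Context);
        ICollection<byte[]> payloads = util.GetPayloadsForQuery(new TermQuery(new Term("body", "lucene")));
        Console.WriteLine("collected " + payloads.Count + " payloads");
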
Example 5
        public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles)
        {
            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

            // all lines in the file
            Console.WriteLine("  parse...");
            JCG.List <string[]> lines = new JCG.List <string[]>(400000);
            foreach (string file in csvFiles)
            {
                using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read);
                Encoding   decoder = Encoding.GetEncoding(encoding);
                TextReader reader  = new StreamReader(inputStream, decoder);

                string line = null;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] entry = CSVUtil.Parse(line);

                    if (entry.Length < 13)
                    {
                        Console.WriteLine("Entry in CSV is not valid: " + line);
                        continue;
                    }

                    string[] formatted = FormatEntry(entry);
                    lines.Add(formatted);

                    // NFKC normalize dictionary entry
                    if (normalizeEntries)
                    {
                        //if (normalizer.isNormalized(entry[0])){
                        if (entry[0].IsNormalized(NormalizationForm.FormKC))
                        {
                            continue;
                        }
                        string[] normalizedEntry = new string[entry.Length];
                        for (int i = 0; i < entry.Length; i++)
                        {
                            //normalizedEntry[i] = normalizer.normalize(entry[i]);
                            normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                        }

                        formatted = FormatEntry(normalizedEntry);
                        lines.Add(formatted);
                    }
                }
            }

            Console.WriteLine("  sort...");

            // sort by term: we sorted the files already and use a stable sort.
            lines.Sort(Comparer <string[]> .Create((left, right) => left[0].CompareToOrdinal(right[0])));

            Console.WriteLine("  encode...");

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <long?>      fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
            Int32sRef            scratch    = new Int32sRef();
            long   ord       = -1; // first ord will be 0
            string lastValue = null;

            // build tokeninfo dictionary
            foreach (string[] entry in lines)
            {
                int next = dictionary.Put(entry);

                if (next == offset)
                {
                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                    continue;
                }

                string token = entry[0];
                if (!token.Equals(lastValue, StringComparison.Ordinal))
                {
                    // new word to add to fst
                    ord++;
                    lastValue = token;
                    scratch.Grow(token.Length);
                    scratch.Length = token.Length;
                    for (int i = 0; i < token.Length; i++)
                    {
                        scratch.Int32s[i] = (int)token[i];
                    }
                    fstBuilder.Add(scratch, ord);
                }
                dictionary.AddMapping((int)ord, offset);
                offset = next;
            }

            FST <long?> fst = fstBuilder.Finish();

            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
            dictionary.SetFST(fst);
            Console.WriteLine(" done");

            return(dictionary);
        }
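
A hedged sketch of driving BuildDictionary; the CSV file names, output path, builder instance, and the writer's Write method are assumptions, not part of the code above:

        // builder: an instance of the class containing BuildDictionary, already
        // configured with its encoding/normalizeEntries fields (assumed).
        IList<string> csvFiles = new JCG.List<string> { "Noun.csv", "Verb.csv" };
        TokenInfoDictionaryWriter dictionary = builder.BuildDictionary(csvFiles);
        dictionary.Write("path/to/output"); // assumed writer API
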
Example 6
        /// <summary>
        /// Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in <see cref="TextFragment"/> objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's EndOffset exceeds the provided text's length</exception>
        public TextFragment[] GetBestTextFragments(
            TokenStream tokenStream,
            string text,
            bool mergeContiguousFragments,
            int maxNumFragments)
        {
            var docFrags = new JCG.List <TextFragment>();
            var newText  = new StringBuilder();

            var termAtt   = tokenStream.AddAttribute <ICharTermAttribute>();
            var offsetAtt = tokenStream.AddAttribute <IOffsetAttribute>();

            tokenStream.Reset();
            var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);

            if (_fragmentScorer is QueryScorer queryScorer)
            {
                queryScorer.SetMaxDocCharsToAnalyze(_maxDocCharsToAnalyze);
            }

            var newStream = _fragmentScorer.Init(tokenStream);

            if (newStream != null)
            {
                tokenStream = newStream;
            }
            _fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            var fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                string tokenText;
                int    startOffset;
                int    endOffset;
                int    lastEndOffset = 0;
                _textFragmenter.Start(text, tokenStream);

                var tokenGroup = new TokenGroup(tokenStream);

                for (bool next = tokenStream.IncrementToken();
                     next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
                     next = tokenStream.IncrementToken())
                {
                    if ((offsetAtt.EndOffset > text.Length)
                        ||
                        (offsetAtt.StartOffset > text.Length)
                        )
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.ToString()
                                                               + " exceeds length of provided text sized " + text.Length);
                    }
                    if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.MatchStartOffset;
                        endOffset   = tokenGroup.MatchEndOffset;
                        tokenText   = text.Substring(startOffset, endOffset - startOffset);
                        string markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                        {
                            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                        }
                        newText.Append(markedUpText);
                        lastEndOffset = Math.Max(endOffset, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (_textFragmenter.IsNewFragment())
                        {
                            currentFrag.Score = _fragmentScorer.FragmentScore;
                            //record stats for a new fragment
                            currentFrag.TextEndPos = newText.Length;
                            currentFrag            = new TextFragment(newText, newText.Length, docFrags.Count);
                            _fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(_fragmentScorer.GetTokenScore());

                    //                if(lastEndOffset>maxDocBytesToAnalyze)
                    //                {
                    //                    break;
                    //                }
                }
                currentFrag.Score = _fragmentScorer.FragmentScore;

                if (tokenGroup.NumTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.MatchStartOffset;
                    endOffset   = tokenGroup.MatchEndOffset;
                    tokenText   = text.Substring(startOffset, endOffset - startOffset);
                    var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                    {
                        newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                    }
                    newText.Append(markedUpText);
                    lastEndOffset = Math.Max(lastEndOffset, endOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing
                if (
                    //                    if there is text beyond the last token considered..
                    (lastEndOffset < text.Length)
                    &&
                    //                    and that text is not too large...
                    (text.Length <= _maxDocCharsToAnalyze)
                    )
                {
                    //append it to the last fragment
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.TextEndPos = newText.Length;

                //sort the most relevant sections of the text
                foreach (var f in docFrags)
                {
                    currentFrag = f;

                    //If you are running with a version of Lucene before 11th Sept 03
                    // you do not have PriorityQueue.insert() - so uncomment the code below

                    /*
                     *                  if (currentFrag.getScore() >= minScore)
                     *                  {
                     *                      fragQueue.put(currentFrag);
                     *                      if (fragQueue.size() > maxNumFragments)
                     *                      { // if hit queue overfull
                     *                          fragQueue.pop(); // remove lowest in hit queue
                     *                          minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                     *                      }
                     *                  }
                     */
                    //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                    //fix to PriorityQueue. The correct method to use here is the new "insert" method
                    // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                    fragQueue.InsertWithOverflow(currentFrag);
                }

                //return the most relevant fragments
                var frag = new TextFragment[fragQueue.Count];
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    JCG.List <TextFragment> fragTexts = new JCG.List <TextFragment>();
                    for (int i = 0; i < frag.Length; i++)
                    {
                        if ((frag[i] != null) && (frag[i].Score > 0))
                        {
                            fragTexts.Add(frag[i]);
                        }
                    }
                    frag = new TextFragment[fragTexts.Count];
                    fragTexts.CopyTo(frag);
                }

                return(frag);
            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.End();
                        tokenStream.Dispose();
                    }
                    catch (Exception e) when(e.IsException())
                    {
                    }
                }
            }
        }
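
A hedged usage sketch of GetBestTextFragments; the analyzer, query, field name, and formatter below are assumptions chosen for illustration:

        var scorer = new QueryScorer(query);                         // query: the query being highlighted (assumed)
        var highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
        TokenStream stream = analyzer.GetTokenStream("body", text);  // analyzer/text: assumed inputs
        TextFragment[] frags = highlighter.GetBestTextFragments(stream, text, true, 3);
        foreach (TextFragment frag in frags)
        {
            if (frag != null && frag.Score > 0)
            {
                Console.WriteLine(frag); // TextFragment.ToString() returns the marked-up text
            }
        }
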
Example 7
        public virtual void TestRandom()
        {
            string[]        tokens   = GetRandomTokens(10);
            Store.Directory indexDir = NewDirectory();
            Store.Directory taxoDir  = NewDirectory();

            RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, indexDir);
            var             tw       = new DirectoryTaxonomyWriter(taxoDir);
            FacetsConfig    config   = new FacetsConfig();
            int             numDocs  = AtLeast(1000);
            int             numDims  = TestUtil.NextInt32(Random, 1, 7);
            IList <TestDoc> testDocs = GetRandomDocs(tokens, numDocs, numDims);

            foreach (TestDoc testDoc in testDocs)
            {
                Document doc = new Document();
                doc.Add(NewStringField("content", testDoc.content, Field.Store.NO));
                testDoc.value = Random.NextSingle();
                doc.Add(new SingleDocValuesField("value", testDoc.value));
                for (int j = 0; j < numDims; j++)
                {
                    if (testDoc.dims[j] != null)
                    {
                        doc.Add(new FacetField("dim" + j, testDoc.dims[j]));
                    }
                }
                w.AddDocument(config.Build(tw, doc));
            }

            // NRT open
            IndexSearcher searcher = NewSearcher(w.GetReader());

            // NRT open
            var tr = new DirectoryTaxonomyReader(tw);

            ValueSource values = new SingleFieldSource("value");

            int iters = AtLeast(100);

            for (int iter = 0; iter < iters; iter++)
            {
                string searchToken = tokens[Random.Next(tokens.Length)];
                if (Verbose)
                {
                    Console.WriteLine("\nTEST: iter content=" + searchToken);
                }
                FacetsCollector fc = new FacetsCollector();
                FacetsCollector.Search(searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
                Facets facets = new TaxonomyFacetSumValueSource(tr, config, fc, values);

                // Slow, yet hopefully bug-free, faceting:
                var expectedValues = new JCG.List <Dictionary <string, float?> >(numDims);
                for (int i = 0; i < numDims; i++)
                {
                    expectedValues.Add(new Dictionary <string, float?>());
                }

                foreach (TestDoc doc in testDocs)
                {
                    if (doc.content.Equals(searchToken, StringComparison.Ordinal))
                    {
                        for (int j = 0; j < numDims; j++)
                        {
                            if (doc.dims[j] != null)
                            {
                                if (!expectedValues[j].TryGetValue(doc.dims[j], out float? v) || v == null)
                                {
                                    expectedValues[j][doc.dims[j]] = doc.value;
                                }
                                else
                                {
                                    expectedValues[j][doc.dims[j]] = (float)v + doc.value;
                                }
                            }
                        }
                    }
                }

                JCG.List <FacetResult> expected = new JCG.List <FacetResult>();
                for (int i = 0; i < numDims; i++)
                {
                    JCG.List <LabelAndValue> labelValues = new JCG.List <LabelAndValue>();
                    float totValue = 0;
                    foreach (KeyValuePair <string, float?> ent in expectedValues[i])
                    {
                        labelValues.Add(new LabelAndValue(ent.Key, ent.Value.Value));
                        totValue += ent.Value.Value;
                    }
                    SortLabelValues(labelValues);
                    if (totValue > 0)
                    {
                        expected.Add(new FacetResult("dim" + i, new string[0], totValue, labelValues.ToArray(), labelValues.Count));
                    }
                }

                // Sort by highest value, tie break by value:
                SortFacetResults(expected);

                IList <FacetResult> actual = facets.GetAllDims(10);

                // Messy: fixup ties
                SortTies(actual);

                if (Verbose)
                {
                    Console.WriteLine("expected=\n" + expected.ToString());
                    Console.WriteLine("actual=\n" + actual.ToString());
                }

                AssertFloatValuesEquals(expected, actual);
            }

            IOUtils.Dispose(w, tw, searcher.IndexReader, tr, indexDir, taxoDir);
        }
Example 8
        /// <summary>
        /// Internal helper method used by check that iterates over
        /// the keys of <paramref name="readerFieldToValIds"/> and generates an <see cref="ICollection{T}"/>
        /// of <see cref="Insanity"/> instances whenever two (or more) <see cref="ReaderField"/> instances are
        /// found that have an ancestry relationship.
        /// </summary>
        /// <seealso cref="InsanityType.SUBREADER"/>
        private static ICollection <Insanity> CheckSubreaders(MapOfSets <int, FieldCache.CacheEntry> valIdToItems, MapOfSets <ReaderField, int> readerFieldToValIds) // LUCENENET: CA1822: Mark members as static
        {
            JCG.List <Insanity> insanity = new JCG.List <Insanity>(23);

            Dictionary <ReaderField, ISet <ReaderField> > badChildren = new Dictionary <ReaderField, ISet <ReaderField> >(17);
            MapOfSets <ReaderField, ReaderField>          badKids     = new MapOfSets <ReaderField, ReaderField>(badChildren); // wrapper

            IDictionary <int, ISet <FieldCache.CacheEntry> > viToItemSets  = valIdToItems.Map;
            IDictionary <ReaderField, ISet <int> >           rfToValIdSets = readerFieldToValIds.Map;

            HashSet <ReaderField> seen = new HashSet <ReaderField>();

            //IDictionary<ReaderField, ISet<int>>.KeyCollection readerFields = rfToValIdSets.Keys;
            foreach (ReaderField rf in rfToValIdSets.Keys)
            {
                if (seen.Contains(rf))
                {
                    continue;
                }

                IList <object> kids = GetAllDescendantReaderKeys(rf.ReaderKey);
                foreach (object kidKey in kids)
                {
                    ReaderField kid = new ReaderField(kidKey, rf.FieldName);

                    // LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
                    if (badChildren.TryGetValue(kid, out ISet <ReaderField> badKid))
                    {
                        // we've already processed this kid as an RF and found other problems
                        // track those problems as our own
                        badKids.Put(rf, kid);
                        badKids.PutAll(rf, badKid);
                        badChildren.Remove(kid);
                    }
                    else if (rfToValIdSets.ContainsKey(kid))
                    {
                        // we have cache entries for the kid
                        badKids.Put(rf, kid);
                    }
                    seen.Add(kid);
                }
                seen.Add(rf);
            }

            // every mapping in badKids represents an Insanity
            foreach (ReaderField parent in badChildren.Keys)
            {
                ISet <ReaderField> kids = badChildren[parent];

                JCG.List <FieldCache.CacheEntry> badEntries = new JCG.List <FieldCache.CacheEntry>(kids.Count * 2);

                // put parent entr(ies) in first
                {
                    foreach (int value in rfToValIdSets[parent])
                    {
                        badEntries.AddRange(viToItemSets[value]);
                    }
                }

                // now the entries for the descendants
                foreach (ReaderField kid in kids)
                {
                    foreach (int value in rfToValIdSets[kid])
                    {
                        badEntries.AddRange(viToItemSets[value]);
                    }
                }

                FieldCache.CacheEntry[] badness = badEntries.ToArray();

                insanity.Add(new Insanity(InsanityType.SUBREADER, "Found caches for descendants of " + parent.ToString(), badness));
            }

            return(insanity);
        }
Example 9
        private void TestCase(int itrsWithVal, int specifiedValsOnItr, bool removeDups)
        {
            // Build a random number of lists
            IList <int?> expected = new JCG.List <int?>();
            Random       random   = new Random(Random.Next());
            int          numLists = itrsWithVal + random.Next(1000 - itrsWithVal);

            IList <int>[] lists = new IList <int> [numLists];
            for (int i = 0; i < numLists; i++)
            {
                lists[i] = new JCG.List <int>();
            }
            int start = random.Next(1000000);
            int end   = start + VALS_TO_MERGE / itrsWithVal / Math.Abs(specifiedValsOnItr);

            for (int i = start; i < end; i++)
            {
                int maxList      = lists.Length;
                int maxValsOnItr = 0;
                int sumValsOnItr = 0;
                for (int itrWithVal = 0; itrWithVal < itrsWithVal; itrWithVal++)
                {
                    int list      = random.Next(maxList);
                    int valsOnItr = specifiedValsOnItr < 0 ? (1 + random.Next(-specifiedValsOnItr)) : specifiedValsOnItr;
                    maxValsOnItr  = Math.Max(maxValsOnItr, valsOnItr);
                    sumValsOnItr += valsOnItr;
                    for (int valOnItr = 0; valOnItr < valsOnItr; valOnItr++)
                    {
                        lists[list].Add(i);
                    }
                    maxList = maxList - 1;
                    ArrayUtil.Swap(lists, list, maxList);
                }
                int maxCount = removeDups ? maxValsOnItr : sumValsOnItr;
                for (int count = 0; count < maxCount; count++)
                {
                    expected.Add(i);
                }
            }
            // Now check that they get merged cleanly
            IEnumerator <int>[] itrs = new IEnumerator <int> [numLists];
            for (int i = 0; i < numLists; i++)
            {
                itrs[i] = lists[i].GetEnumerator();
            }
            try
            {
                MergedEnumerator <int> mergedItr   = new MergedEnumerator <int>(removeDups, itrs);
                IEnumerator <int?>     expectedItr = expected.GetEnumerator();
                while (expectedItr.MoveNext())
                {
                    Assert.IsTrue(mergedItr.MoveNext());
                    Assert.AreEqual(expectedItr.Current, mergedItr.Current);
                }
                Assert.IsFalse(mergedItr.MoveNext());
            }
            finally
            {
                IOUtils.Dispose(itrs);
            }
        }
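
For context, a hedged sketch of what this test exercises: MergedEnumerator merges several sorted enumerators into one sorted stream, optionally collapsing duplicates across inputs (sample values are illustrative):

        IEnumerator<int> a = new JCG.List<int> { 1, 3, 5 }.GetEnumerator();
        IEnumerator<int> b = new JCG.List<int> { 2, 3, 6 }.GetEnumerator();
        MergedEnumerator<int> merged = new MergedEnumerator<int>(true, a, b); // true: remove duplicates across inputs
        while (merged.MoveNext())
        {
            Console.Write(merged.Current + " "); // prints: 1 2 3 5 6
        }
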
Example 10
        public virtual void TestRandomStoredFields()
        {
            using Directory dir = NewDirectory();
            Random rand = Random;

            using RandomIndexWriter w = new RandomIndexWriter(rand, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMaxBufferedDocs(TestUtil.NextInt32(rand, 5, 20)));
            //w.w.setNoCFSRatio(0.0);
            int docCount   = AtLeast(200);
            int fieldCount = TestUtil.NextInt32(rand, 1, 5);

            IList <int> fieldIDs = new JCG.List <int>();

            FieldType customType = new FieldType(TextField.TYPE_STORED);

            customType.IsTokenized = false;
            Field idField = NewField("id", "", customType);

            for (int i = 0; i < fieldCount; i++)
            {
                fieldIDs.Add(i);
            }

            IDictionary <string, Document> docs = new Dictionary <string, Document>();

            if (Verbose)
            {
                Console.WriteLine("TEST: build index docCount=" + docCount);
            }

            FieldType customType2 = new FieldType();

            customType2.IsStored = true;
            for (int i = 0; i < docCount; i++)
            {
                Document doc = new Document();
                doc.Add(idField);
                string id = "" + i;
                idField.SetStringValue(id);
                docs[id] = doc;
                if (Verbose)
                {
                    Console.WriteLine("TEST: add doc id=" + id);
                }

                foreach (int field in fieldIDs)
                {
                    string s;
                    if (rand.Next(4) != 3)
                    {
                        s = TestUtil.RandomUnicodeString(rand, 1000);
                        doc.Add(NewField("f" + field, s, customType2));
                    }
                    else
                    {
                        s = null;
                    }
                }
                w.AddDocument(doc);
                if (rand.Next(50) == 17)
                {
                    // mixup binding of field name -> Number every so often
                    fieldIDs.Shuffle(Random);
                }
                if (rand.Next(5) == 3 && i > 0)
                {
                    string delID = "" + rand.Next(i);
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: delete doc id=" + delID);
                    }
                    w.DeleteDocuments(new Term("id", delID));
                    docs.Remove(delID);
                }
            }

            if (Verbose)
            {
                Console.WriteLine("TEST: " + docs.Count + " docs in index; now load fields");
            }
            if (docs.Count > 0)
            {
                string[] idsList = docs.Keys.ToArray(/*new string[docs.Count]*/);

                for (int x = 0; x < 2; x++)
                {
                    using (IndexReader r = w.GetReader())
                    {
                        IndexSearcher s = NewSearcher(r);

                        if (Verbose)
                        {
                            Console.WriteLine("TEST: cycle x=" + x + " r=" + r);
                        }

                        int num = AtLeast(1000);
                        for (int iter = 0; iter < num; iter++)
                        {
                            string testID = idsList[rand.Next(idsList.Length)];
                            if (Verbose)
                            {
                                Console.WriteLine("TEST: test id=" + testID);
                            }
                            TopDocs hits = s.Search(new TermQuery(new Term("id", testID)), 1);
                            Assert.AreEqual(1, hits.TotalHits);
                            Document doc    = r.Document(hits.ScoreDocs[0].Doc);
                            Document docExp = docs[testID];
                            for (int i = 0; i < fieldCount; i++)
                            {
                                assertEquals("doc " + testID + ", field f" + fieldCount + " is wrong", docExp.Get("f" + i), doc.Get("f" + i));
                            }
                        }
                    } // r.Dispose();
                    w.ForceMerge(1);
                }
            }
        }
Example 11
        /// <summary>
        /// Tests a CacheEntry[] for indication of "insane" cache usage.
        /// <para>
        /// <b>NOTE:</b>FieldCache CreationPlaceholder objects are ignored.
        /// (:TODO: is this a bad idea? are we masking a real problem?)
        /// </para>
        /// </summary>
        public Insanity[] Check(params FieldCache.CacheEntry[] cacheEntries)
        {
            if (null == cacheEntries || 0 == cacheEntries.Length)
            {
                return(Arrays.Empty <Insanity>());
            }

            if (estimateRam)
            {
                for (int i = 0; i < cacheEntries.Length; i++)
                {
                    cacheEntries[i].EstimateSize();
                }
            }

            // the indirect mapping lets MapOfSet dedup identical valIds for us
            // maps the (valId) identityhashCode of cache values to
            // sets of CacheEntry instances
            MapOfSets <int, FieldCache.CacheEntry> valIdToItems = new MapOfSets <int, FieldCache.CacheEntry>(new Dictionary <int, ISet <FieldCache.CacheEntry> >(17));
            // maps ReaderField keys to Sets of ValueIds
            MapOfSets <ReaderField, int> readerFieldToValIds = new MapOfSets <ReaderField, int>(new Dictionary <ReaderField, ISet <int> >(17));

            // any keys that we know result in more than one valId
            ISet <ReaderField> valMismatchKeys = new JCG.HashSet <ReaderField>();

            // iterate over all the cacheEntries to get the mappings we'll need
            for (int i = 0; i < cacheEntries.Length; i++)
            {
                FieldCache.CacheEntry item = cacheEntries[i];
                object val = item.Value;

                // It's OK to have dup entries, where one is eg
                // float[] and the other is the Bits (from
                // getDocWithField())
                if (val is IBits)
                {
                    continue;
                }

                if (val is FieldCache.ICreationPlaceholder)
                {
                    continue;
                }

                ReaderField rf = new ReaderField(item.ReaderKey, item.FieldName);

                int valId = RuntimeHelpers.GetHashCode(val);

                // indirect mapping, so the MapOfSet will dedup identical valIds for us
                valIdToItems.Put(valId, item);
                if (1 < readerFieldToValIds.Put(rf, valId))
                {
                    valMismatchKeys.Add(rf);
                }
            }

            JCG.List <Insanity> insanity = new JCG.List <Insanity>(valMismatchKeys.Count * 3);

            insanity.AddRange(CheckValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys));
            insanity.AddRange(CheckSubreaders(valIdToItems, readerFieldToValIds));

            return(insanity.ToArray());
        }
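
A hedged sketch of calling Check; the parameterless constructor and the GetCacheEntries accessor used to obtain the entries are assumptions about the surrounding API:

        var checker = new FieldCacheSanityChecker();                             // assumed containing type
        FieldCache.CacheEntry[] entries = FieldCache.DEFAULT.GetCacheEntries();  // assumed accessor
        Insanity[] problems = checker.Check(entries);
        foreach (Insanity insanity in problems)
        {
            Console.WriteLine(insanity); // relies on Insanity.ToString() describing the offending caches
        }
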
Example 12
        public virtual void runTestQuery(SpatialMatchConcern concern, SpatialTestQuery q)
        {
            String        msg = q.toString(); //"Query: " + q.args.toString(ctx);
            SearchResults got = executeQuery(makeQuery(q), Math.Max(100, q.ids.size() + 1));

            if (storeShape && got.numFound > 0)
            {
                //check stored value is there
                assertNotNull(got.results[0].document.Get(strategy.FieldName));
            }
            if (concern.orderIsImportant)
            {
                IEnumerator <String> ids = q.ids.GetEnumerator();
                foreach (SearchResult r in got.results)
                {
                    String id = r.document.Get("id");
                    if (!ids.MoveNext())
                    {
                        fail(msg + " :: Did not get enough results.  Expect" + q.ids + ", got: " + got.toDebugString());
                    }
                    assertEquals("out of order: " + msg, ids.Current, id);
                }

                if (ids.MoveNext())
                {
                    fail(msg + " :: expect more results then we got: " + ids.Current);
                }
            }
            else
            {
                // We are looking at how the results overlap
                if (concern.resultsAreSuperset)
                {
                    ISet <string> found = new JCG.HashSet <string>();
                    foreach (SearchResult r in got.results)
                    {
                        found.add(r.document.Get("id"));
                    }
                    foreach (String s in q.ids)
                    {
                        if (!found.contains(s))
                        {
                            fail("Results are mising id: " + s + " :: " + found);
                        }
                    }
                }
                else
                {
                    IList <string> found = new JCG.List <string>();
                    foreach (SearchResult r in got.results)
                    {
                        found.Add(r.document.Get("id"));
                    }

                    // sort both so that the order is not important
                    CollectionUtil.TimSort(q.ids);
                    CollectionUtil.TimSort(found);
                    assertEquals(msg, q.ids.ToString(), found.ToString());
                }
            }
        }
Example 13
        private void AssertTermsSeeking(Terms leftTerms, Terms rightTerms)
        {
            TermsEnum leftEnum  = null;
            TermsEnum rightEnum = null;

            // just an upper bound
            int    numTests = AtLeast(20);
            Random random   = Random;

            // collect this number of terms from the left side
            ISet <BytesRef> tests     = new JCG.HashSet <BytesRef>();
            int             numPasses = 0;

            while (numPasses < 10 && tests.Count < numTests)
            {
                leftEnum = leftTerms.GetEnumerator(leftEnum);
                BytesRef term = null;
                while (leftEnum.MoveNext())
                {
                    term = leftEnum.Term;
                    int code = random.Next(10);
                    if (code == 0)
                    {
                        // the term
                        tests.Add(BytesRef.DeepCopyOf(term));
                    }
                    else if (code == 1)
                    {
                        // truncated subsequence of term
                        term = BytesRef.DeepCopyOf(term);
                        if (term.Length > 0)
                        {
                            // truncate it
                            term.Length = random.Next(term.Length);
                        }
                    }
                    else if (code == 2)
                    {
                        // term, but ensure a non-zero offset
                        var newbytes = new byte[term.Length + 5];
                        Array.Copy(term.Bytes, term.Offset, newbytes, 5, term.Length);
                        tests.Add(new BytesRef(newbytes, 5, term.Length));
                    }
                }
                numPasses++;
            }

            IList <BytesRef> shuffledTests = new JCG.List <BytesRef>(tests);

            shuffledTests.Shuffle(Random);

            foreach (BytesRef b in shuffledTests)
            {
                leftEnum  = leftTerms.GetEnumerator(leftEnum);
                rightEnum = rightTerms.GetEnumerator(rightEnum);

                Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));
                Assert.AreEqual(leftEnum.SeekExact(b), rightEnum.SeekExact(b));

                SeekStatus leftStatus;
                SeekStatus rightStatus;

                leftStatus  = leftEnum.SeekCeil(b);
                rightStatus = rightEnum.SeekCeil(b);
                Assert.AreEqual(leftStatus, rightStatus);
                if (leftStatus != SeekStatus.END)
                {
                    Assert.AreEqual(leftEnum.Term, rightEnum.Term);
                }

                leftStatus  = leftEnum.SeekCeil(b);
                rightStatus = rightEnum.SeekCeil(b);
                Assert.AreEqual(leftStatus, rightStatus);
                if (leftStatus != SeekStatus.END)
                {
                    Assert.AreEqual(leftEnum.Term, rightEnum.Term);
                }
            }
        }
Example 14
        /// <summary>
        /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
        /// accumulating the <see cref="FST"/> end node and output for each path.
        /// </summary>
        public static IList <Path <T> > IntersectPrefixPaths <T>(Automaton a, FST <T> fst) where T : class // LUCENENET specific - added class constraint because we are comparing reference equality
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(a.IsDeterministic);
            }
            IList <Path <T> > queue    = new JCG.List <Path <T> >();
            IList <Path <T> > endNodes = new JCG.List <Path <T> >();

            queue.Add(new Path <T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc <T>()), fst.Outputs.NoOutput, new Int32sRef()));

            FST.Arc <T>     scratchArc = new FST.Arc <T>();
            FST.BytesReader fstReader  = fst.GetBytesReader();

            while (queue.Count != 0)
            {
                Path <T> path = queue[queue.Count - 1];
                queue.Remove(path);
                if (path.State.Accept)
                {
                    endNodes.Add(path);
                    // we can stop here if we accept this path,
                    // we accept all further paths too
                    continue;
                }

                Int32sRef currentInput = path.Input;
                foreach (Transition t in path.State.GetTransitions())
                {
                    int min = t.Min;
                    int max = t.Max;
                    if (min == max)
                    {
                        FST.Arc <T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                        if (nextArc != null)
                        {
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = t.Min;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                        }
                    }
                    else
                    {
                        // TODO: if this transition's TO state is accepting, and
                        // it accepts the entire range possible in the FST (ie. 0 to 255),
                        // we can simply use the prefix as the accepted state instead of
                        // looking up all the ranges and terminate early
                        // here.  This just shifts the work from one queue
                        // (this one) to another (the completion search
                        // done in AnalyzingSuggester).

                        FST.Arc <T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                        while (nextArc != null && nextArc.Label <= max)
                        {
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc.Label <= max);
                            }
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc.Label >= min, "{0} {1}", nextArc.Label, min);
                            }
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = nextArc.Label;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                            int label = nextArc.Label; // used in assert
                            nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc is null || label < nextArc.Label, "last: {0} next: {1}", label, nextArc?.Label);
                            }
                        }
                    }
                }
            }
            return(endNodes);
        }
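
A hedged usage sketch of IntersectPrefixPaths; fst is assumed to be a previously built FST<BytesRef> (BytesRef satisfies the class constraint on T), and the automaton is built from a literal prefix:

        Automaton prefixAutomaton = BasicAutomata.MakeString("ab"); // deterministic by construction
        IList<Path<BytesRef>> paths = IntersectPrefixPaths(prefixAutomaton, fst);
        foreach (Path<BytesRef> path in paths)
        {
            // path.Input holds the consumed labels, path.Output the accumulated FST output
            Console.WriteLine(path.Input + " -> " + path.Output);
        }
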
Example 15
        private void DoTest(int inputMode, Int32sRef[] terms)
        {
            Array.Sort(terms);

            // Up to two positive ints, shared, generally but not
            // monotonically increasing
            {
                if (Verbose)
                {
                    Console.WriteLine("TEST: now test UpToTwoPositiveIntOutputs");
                }
                UpToTwoPositiveInt64Outputs   outputs = UpToTwoPositiveInt64Outputs.GetSingleton(true);
                IList <InputOutput <object> > pairs   = new JCG.List <InputOutput <object> >(terms.Length);
                long lastOutput = 0;
                for (int idx = 0; idx < terms.Length; idx++)
                {
                    // Sometimes go backwards
                    long value = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
                    while (value < 0)
                    {
                        value = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
                    }
                    object output;
                    if (Random.Next(5) == 3)
                    {
                        long value2 = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
                        while (value2 < 0)
                        {
                            value2 = lastOutput + TestUtil.NextInt32(Random, -100, 1000);
                        }
                        IList <long> values = new JCG.List <long>();
                        values.Add(value);
                        values.Add(value2);
                        output = values;
                    }
                    else
                    {
                        output = outputs.Get(value);
                    }
                    pairs.Add(new InputOutput <object>(terms[idx], output));
                }
                new FSTTesterHelper <object>(Random, dir, inputMode, pairs, outputs, false).DoTest(false);

                // ListOfOutputs(PositiveIntOutputs), generally but not
                // monotonically increasing
                {
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: now test OneOrMoreOutputs");
                    }
                    PositiveInt32Outputs          _outputs = PositiveInt32Outputs.Singleton;
                    ListOfOutputs <long?>         outputs2 = new ListOfOutputs <long?>(_outputs);
                    IList <InputOutput <object> > pairs2   = new JCG.List <InputOutput <object> >(terms.Length);
                    long lastOutput2 = 0;
                    for (int idx = 0; idx < terms.Length; idx++)
                    {
                        int           outputCount = TestUtil.NextInt32(Random, 1, 7);
                        IList <long?> values      = new JCG.List <long?>();
                        for (int i = 0; i < outputCount; i++)
                        {
                            // Sometimes go backwards
                            long value = lastOutput2 + TestUtil.NextInt32(Random, -100, 1000);
                            while (value < 0)
                            {
                                value = lastOutput2 + TestUtil.NextInt32(Random, -100, 1000);
                            }
                            values.Add(value);
                            lastOutput2 = value;
                        }

                        object output;
                        if (values.Count == 1)
                        {
                            output = values[0];
                        }
                        else
                        {
                            output = values;
                        }

                        pairs2.Add(new InputOutput <object>(terms[idx], output));
                    }
                    new FSTTester <object>(Random, dir, inputMode, pairs2, outputs2, false).DoTest(false);
                }
            }
        }
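For orientation, the pairs built above are fed through the FST test helper; a minimal sketch of building and querying a small FST directly follows. It is hedged: the Builder and Util calls are assumed from the Lucene.Net.Util.Fst namespace used elsewhere in these examples, the output type long? is inferred from the ListOfOutputs<long?> usage above, and the terms and weights are made up.

// Hedged sketch: build a tiny FST and look a term up (assumed Lucene.Net.Util.Fst APIs).
// Inputs must be added in sorted order; "cat" < "dog".
PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
Int32sRef scratch = new Int32sRef();
builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(new BytesRef("cat"), scratch), 5L);
builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(new BytesRef("dog"), scratch), 7L);
FST<long?> fst = builder.Finish();
long? weight = Lucene.Net.Util.Fst.Util.Get(fst, new BytesRef("cat")); // expected: 5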
Example n. 16
 /// <summary>
 /// Extracts all <see cref="MultiTermQuery"/>s for <paramref name="field"/>, and returns equivalent
 /// automata that will match terms.
 /// </summary>
 internal static CharacterRunAutomaton[] ExtractAutomata(Query query, string field)
 {
     JCG.List <CharacterRunAutomaton> list = new JCG.List <CharacterRunAutomaton>();
     if (query is BooleanQuery booleanQuery)
     {
         foreach (BooleanClause clause in booleanQuery.GetClauses())
         {
             if (!clause.IsProhibited)
             {
                 list.AddRange(ExtractAutomata(clause.Query, field));
             }
         }
     }
     else if (query is DisjunctionMaxQuery disjunctionMaxQuery)
     {
         foreach (Query sub in disjunctionMaxQuery.Disjuncts)
         {
             list.AddRange(ExtractAutomata(sub, field));
         }
     }
     else if (query is SpanOrQuery spanOrQuery)
     {
         foreach (Query sub in spanOrQuery.GetClauses())
         {
             list.AddRange(ExtractAutomata(sub, field));
         }
     }
     else if (query is SpanNearQuery spanNearQuery)
     {
         foreach (Query sub in spanNearQuery.GetClauses())
         {
             list.AddRange(ExtractAutomata(sub, field));
         }
     }
     else if (query is SpanNotQuery spanNotQuery)
     {
         list.AddRange(ExtractAutomata(spanNotQuery.Include, field));
     }
     else if (query is SpanPositionCheckQuery spanPositionCheckQuery)
     {
         list.AddRange(ExtractAutomata(spanPositionCheckQuery.Match, field));
     }
     else if (query is ISpanMultiTermQueryWrapper spanMultiTermQueryWrapper)
     {
         list.AddRange(ExtractAutomata(spanMultiTermQueryWrapper.WrappedQuery, field));
     }
     else if (query is AutomatonQuery aq)
     {
         if (aq.Field.Equals(field, StringComparison.Ordinal))
         {
             list.Add(new CharacterRunAutomatonToStringAnonymousClass(aq.Automaton, () => aq.ToString()));
         }
     }
     else if (query is PrefixQuery pq)
     {
         Term prefix = pq.Prefix;
         if (prefix.Field.Equals(field, StringComparison.Ordinal))
         {
             list.Add(new CharacterRunAutomatonToStringAnonymousClass(
                          BasicOperations.Concatenate(BasicAutomata.MakeString(prefix.Text), BasicAutomata.MakeAnyString()),
                          () => pq.ToString()));
         }
     }
     else if (query is FuzzyQuery fq)
     {
         if (fq.Field.Equals(field, StringComparison.Ordinal))
         {
             string utf16    = fq.Term.Text;
             int[]  termText = new int[utf16.CodePointCount(0, utf16.Length)];
             for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
             {
                 termText[j++] = cp = utf16.CodePointAt(i);
             }
             int    termLength             = termText.Length;
             int    prefixLength           = Math.Min(fq.PrefixLength, termLength);
             string suffix                 = UnicodeUtil.NewString(termText, prefixLength, termText.Length - prefixLength);
             LevenshteinAutomata builder   = new LevenshteinAutomata(suffix, fq.Transpositions);
             Automaton           automaton = builder.ToAutomaton(fq.MaxEdits);
             if (prefixLength > 0)
             {
                 Automaton prefix = BasicAutomata.MakeString(UnicodeUtil.NewString(termText, 0, prefixLength));
                 automaton = BasicOperations.Concatenate(prefix, automaton);
             }
             list.Add(new CharacterRunAutomatonToStringAnonymousClass(automaton, () => fq.ToString()));
         }
     }
     else if (query is TermRangeQuery tq)
     {
         if (tq.Field.Equals(field, StringComparison.Ordinal))
         {
              // this is *not* an automaton, but it's very simple
             list.Add(new SimpleCharacterRunAutomatonAnonymousClass(BasicAutomata.MakeEmpty(), tq));
         }
     }
     return(list.ToArray(/*new CharacterRunAutomaton[list.size()]*/));
 }
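As a rough illustration of the PrefixQuery branch above, the sketch below builds the same concatenated automaton by hand and runs candidate terms against it. It is a hedged example: the prefix "lucen" is made up, and CharacterRunAutomaton.Run(string) is assumed to be available as in the automaton package used here.

// Hedged sketch mirroring the PrefixQuery branch (hypothetical prefix "lucen").
Automaton prefixAutomaton = BasicOperations.Concatenate(
    BasicAutomata.MakeString("lucen"),   // the literal prefix
    BasicAutomata.MakeAnyString());      // followed by any suffix
CharacterRunAutomaton run = new CharacterRunAutomaton(prefixAutomaton);
bool hit  = run.Run("lucene");           // true  - starts with the prefix
bool miss = run.Run("solr");             // false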
Example n. 17
        public virtual void TestCRTReopen()
        {
            //test behaving badly

            //should be high enough
            int maxStaleSecs = 20;

            //build crap data just to store it.
            string s = "        abcdefghijklmnopqrstuvwxyz     ";

            char[]        chars   = s.ToCharArray();
            StringBuilder builder = new StringBuilder(2048);

            for (int i = 0; i < 2048; i++)
            {
                builder.Append(chars[Random.Next(chars.Length)]);
            }
            string content = builder.ToString();

            SnapshotDeletionPolicy sdp = new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
            Directory         dir      = new NRTCachingDirectory(NewFSDirectory(CreateTempDir("nrt")), 5, 128);
            IndexWriterConfig config   = new IndexWriterConfig(
#pragma warning disable 612, 618
                Version.LUCENE_46,
#pragma warning restore 612, 618
                new MockAnalyzer(Random));

            config.SetIndexDeletionPolicy(sdp);
            config.SetOpenMode(OpenMode.CREATE_OR_APPEND);
            IndexWriter         iw  = new IndexWriter(dir, config);
            SearcherManager     sm  = new SearcherManager(iw, true, new SearcherFactory());
            TrackingIndexWriter tiw = new TrackingIndexWriter(iw);
            ControlledRealTimeReopenThread <IndexSearcher> controlledRealTimeReopenThread =
                new ControlledRealTimeReopenThread <IndexSearcher>(tiw, sm, maxStaleSecs, 0);

            controlledRealTimeReopenThread.IsBackground = true;
            controlledRealTimeReopenThread.Start();

            IList <ThreadJob> commitThreads = new JCG.List <ThreadJob>();

            for (int i = 0; i < 500; i++)
            {
                if (i > 0 && i % 50 == 0)
                {
                    ThreadJob commitThread = new RunnableAnonymousClass(this, sdp, dir, iw);
                    commitThread.Start();
                    commitThreads.Add(commitThread);
                }
                Document d = new Document();
                d.Add(new TextField("count", i + "", Field.Store.NO));
                d.Add(new TextField("content", content, Field.Store.YES));
                long start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
                long l     = tiw.AddDocument(d);
                controlledRealTimeReopenThread.WaitForGeneration(l);
                long wait = (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond) - start; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
                assertTrue("waited too long for generation " + wait, wait < (maxStaleSecs * 1000));
                IndexSearcher searcher = sm.Acquire();
                TopDocs       td       = searcher.Search(new TermQuery(new Term("count", i + "")), 10);
                sm.Release(searcher);
                assertEquals(1, td.TotalHits);
            }

            foreach (ThreadJob commitThread in commitThreads)
            {
                commitThread.Join();
            }

            controlledRealTimeReopenThread.Dispose();
            sm.Dispose();
            iw.Dispose();
            dir.Dispose();
        }
Example n. 18
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                IList <Scorer> required           = new JCG.List <Scorer>();
                IList <Scorer> prohibited         = new JCG.List <Scorer>();
                IList <Scorer> optional           = new JCG.List <Scorer>();
                IEnumerator <BooleanClause> cIter = outerInstance.clauses.GetEnumerator();

                foreach (Weight w in m_weights)
                {
                    cIter.MoveNext();
                    BooleanClause c         = cIter.Current;
                    Scorer        subScorer = w.GetScorer(context, acceptDocs);
                    if (subScorer == null)
                    {
                        if (c.IsRequired)
                        {
                            return(null);
                        }
                    }
                    else if (c.IsRequired)
                    {
                        required.Add(subScorer);
                    }
                    else if (c.IsProhibited)
                    {
                        prohibited.Add(subScorer);
                    }
                    else
                    {
                        optional.Add(subScorer);
                    }
                }

                if (required.Count == 0 && optional.Count == 0)
                {
                    // no required and optional clauses.
                    return(null);
                }
                else if (optional.Count < outerInstance.m_minNrShouldMatch)
                {
                    // either >1 req scorer, or there are 0 req scorers and at least 1
                    // optional scorer. Therefore if there are not enough optional scorers
                    // no documents will be matched by the query
                    return(null);
                }

                // simple conjunction
                if (optional.Count == 0 && prohibited.Count == 0)
                {
                    float coord = disableCoord ? 1.0f : Coord(required.Count, m_maxCoord);
                    return(new ConjunctionScorer(this, required.ToArray(), coord));
                }

                // simple disjunction
                if (required.Count == 0 && prohibited.Count == 0 && outerInstance.m_minNrShouldMatch <= 1 && optional.Count > 1)
                {
                    var coord = new float[optional.Count + 1];
                    for (int i = 0; i < coord.Length; i++)
                    {
                        coord[i] = disableCoord ? 1.0f : Coord(i, m_maxCoord);
                    }
                    return(new DisjunctionSumScorer(this, optional.ToArray(), coord));
                }

                // Return a BooleanScorer2
                return(new BooleanScorer2(this, disableCoord, outerInstance.m_minNrShouldMatch, required, prohibited, optional, m_maxCoord));
            }
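To relate these branches to concrete queries: required-only clauses take the ConjunctionScorer path, optional-only clauses (with m_minNrShouldMatch <= 1 and more than one optional clause) take the DisjunctionSumScorer path, and anything mixed falls through to BooleanScorer2. A hedged sketch with placeholder field and term names:

// Hedged sketch: three boolean queries that would take the three branches above.
BooleanQuery conjunction = new BooleanQuery();
conjunction.Add(new TermQuery(new Term("body", "apache")), Occur.MUST);
conjunction.Add(new TermQuery(new Term("body", "lucene")), Occur.MUST);   // required only -> ConjunctionScorer

BooleanQuery disjunction = new BooleanQuery();
disjunction.Add(new TermQuery(new Term("body", "apache")), Occur.SHOULD);
disjunction.Add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD); // optional only -> DisjunctionSumScorer

BooleanQuery mixed = new BooleanQuery();
mixed.Add(new TermQuery(new Term("body", "apache")), Occur.MUST);
mixed.Add(new TermQuery(new Term("body", "solr")), Occur.MUST_NOT);
mixed.Add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD);       // mixed -> BooleanScorer2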
Example n. 19
        /// <summary>
        /// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
        /// can be used to feed the highlighter with a pre-parsed token
        /// stream.  The <see cref="Terms"/> must have offsets available.
        /// <para/>
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// <list type="bullet">
        ///     <item><description>
        ///     with TermVector offset only data stored - 420  milliseconds
        ///     </description></item>
        ///     <item><description>
        ///     with TermVector offset AND position data stored - 271 milliseconds
        ///     (nb timings for TermVector with position data are based on a tokenizer with contiguous
        ///     positions - no overlaps or gaps)
        ///     </description></item>
        ///     <item><description>
        ///     The cost of not using TermPositionVector to store
        ///     pre-parsed content and using an analyzer to re-parse the original content:
        ///     - reanalyzing the original content - 980 milliseconds
        ///     </description></item>
        /// </list>
        ///
        /// The re-analyze timings will typically vary depending on -
        /// <list type="number">
        ///     <item><description>
        ///     The complexity of the analyzer code (timings above were using a
        ///     stemmer/lowercaser/stopword combo)
        ///     </description></item>
        ///     <item><description>
        ///     The  number of other fields (Lucene reads ALL fields off the disk
        ///     when accessing just one document field - can cost dear!)
        ///     </description></item>
        ///     <item><description>
        ///     Use of compression on field storage - could be faster due to compression (less disk IO)
        ///     or slower (more CPU burn) depending on the content.
        ///     </description></item>
        /// </list>
        /// </summary>
        /// <param name="tpv"></param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
        /// <exception cref="ArgumentException">if no offsets are available</exception>
        public static TokenStream GetTokenStream(Terms tpv,
                                                 bool tokenPositionsGuaranteedContiguous)
        {
            if (!tpv.HasOffsets)
            {
                throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
            }

            if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
            {
                return(new TokenStreamFromTermPositionVector(tpv));
            }

            bool hasPayloads = tpv.HasPayloads;

            // code to reconstruct the original sequence of Tokens
            TermsEnum termsEnum   = tpv.GetEnumerator();
            int       totalTokens = 0;

            while (termsEnum.MoveNext())
            {
                totalTokens += (int)termsEnum.TotalTermFreq;
            }
            Token[]          tokensInOriginalOrder = new Token[totalTokens];
            JCG.List <Token> unsortedTokens        = null;
            termsEnum = tpv.GetEnumerator();
            DocsAndPositionsEnum dpEnum = null;

            while (termsEnum.MoveNext())
            {
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                if (dpEnum is null)
                {
                    throw new ArgumentException("Required TermVector Offset information was not found");
                }
                string term = termsEnum.Term.Utf8ToString();

                dpEnum.NextDoc();
                int freq = dpEnum.Freq;
                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = dpEnum.NextPosition();
                    if (dpEnum.StartOffset < 0)
                    {
                        throw new ArgumentException("Required TermVector Offset information was not found");
                    }
                    Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
                    if (hasPayloads)
                    {
                        // Must make a deep copy of the returned payload,
                        // since D&PEnum API is allowed to re-use on every
                        // call:
                        token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
                    }

                    if (tokenPositionsGuaranteedContiguous && pos != -1)
                    {
                        // We have positions stored and a guarantee that the token position
                        // information is contiguous

                        // This may be fast BUT won't work if Tokenizers are used which create >1
                        // token in the same position or
                        // create jumps in position numbers - this code would fail under those
                        // circumstances

                        // tokens stored with positions - can use this to index straight into
                        // sorted array
                        tokensInOriginalOrder[pos] = token;
                    }
                    else
                    {
                        // tokens NOT stored with positions or not guaranteed contiguous - must
                        // add to list and sort later
                        if (unsortedTokens is null)
                        {
                            unsortedTokens = new JCG.List <Token>();
                        }
                        unsortedTokens.Add(token);
                    }
                }
            }

            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                ArrayUtil.TimSort(tokensInOriginalOrder, TokenComparer.Default);
                //tokensInOriginalOrder = tokensInOriginalOrder
                //    .OrderBy(t => t, new TokenComparer() )
                //    .ToArray();
            }
            return(new StoredTokenStream(tokensInOriginalOrder));
        }
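A hedged usage sketch of the method above: pull a term vector (indexed with offsets) for one document and replay it as a TokenStream. The reader, document id, and field name are placeholders, and the method is assumed to be exposed on the highlighter's TokenSources class as in this port.

// Hedged sketch: replay a stored term vector as a TokenStream (placeholder reader/docId/field).
Terms tpv = reader.GetTermVector(docId, "body");   // must have been indexed with offsets
if (tpv != null)
{
    TokenStream ts = TokenSources.GetTokenStream(tpv, false);
    ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        Console.WriteLine(termAtt.ToString() + " [" + offsetAtt.StartOffset + "-" + offsetAtt.EndOffset + "]");
    }
    ts.End();
    ts.Dispose();
}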
Example n. 20
            public override Query Rewrite(IndexReader reader)
            {
                // ArrayList spanClauses = new ArrayList();
                if (contents is TermQuery)
                {
                    return(contents);
                }
                // Build a sequence of Span clauses arranged in a SpanNear - child
                // clauses can be complex
                // Booleans e.g. nots and ors etc
                int numNegatives = 0;

                if (!(contents is BooleanQuery))
                {
                    throw new ArgumentException("Unknown query type \""
                                                + contents.GetType().Name
                                                + "\" found in phrase query string \"" + phrasedQueryStringContents
                                                + "\"");
                }
                BooleanQuery bq = (BooleanQuery)contents;

                BooleanClause[] bclauses       = bq.GetClauses();
                SpanQuery[]     allSpanClauses = new SpanQuery[bclauses.Length];
                // For all clauses e.g. one* two~
                for (int i = 0; i < bclauses.Length; i++)
                {
                    // HashSet bclauseterms=new HashSet();
                    Query qc = bclauses[i].Query;
                    // Rewrite this clause e.g one* becomes (one OR onerous)
                    qc = qc.Rewrite(reader);
                    if (bclauses[i].Occur.Equals(Occur.MUST_NOT))
                    {
                        numNegatives++;
                    }

                    if (qc is BooleanQuery booleanQuery)
                    {
                        IList <SpanQuery> sc = new JCG.List <SpanQuery>();
                        AddComplexPhraseClause(sc, booleanQuery);
                        if (sc.Count > 0)
                        {
                            allSpanClauses[i] = sc[0];
                        }
                        else
                        {
                            // Insert fake term e.g. phrase query was for "Fred Smithe*" and
                            // there were no "Smithe*" terms - need to
                            // prevent match on just "Fred".
                            allSpanClauses[i] = new SpanTermQuery(new Term(field,
                                                                           "Dummy clause because no terms found - must match nothing"));
                        }
                    }
                    else
                    {
                        if (qc is TermQuery tq)
                        {
                            allSpanClauses[i] = new SpanTermQuery(tq.Term);
                        }
                        else
                        {
                            throw new ArgumentException("Unknown query type \""
                                                        + qc.GetType().Name
                                                        + "\" found in phrase query string \""
                                                        + phrasedQueryStringContents + "\"");
                        }
                    }
                }
                if (numNegatives == 0)
                {
                    // The simple case - no negative elements in phrase
                    return(new SpanNearQuery(allSpanClauses, slopFactor, inOrder));
                }
                // Complex case - we have mixed positives and negatives in the
                // sequence.
                // Need to return a SpanNotQuery
                JCG.List <SpanQuery> positiveClauses = new JCG.List <SpanQuery>();
                for (int j = 0; j < allSpanClauses.Length; j++)
                {
                    if (!bclauses[j].Occur.Equals(Occur.MUST_NOT))
                    {
                        positiveClauses.Add(allSpanClauses[j]);
                    }
                }

                SpanQuery[] includeClauses = positiveClauses
                                             .ToArray();

                SpanQuery include; // LUCENENET: IDE0059: Remove unnecessary value assignment

                if (includeClauses.Length == 1)
                {
                    include = includeClauses[0]; // only one positive clause
                }
                else
                {
                    // need to increase slop factor based on gaps introduced by
                    // negatives
                    include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
                                                inOrder);
                }
                // Use sequence of positive and negative values as the exclude.
                SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
                                                          inOrder);
                SpanNotQuery snot = new SpanNotQuery(include, exclude);

                return(snot);
            }
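A hedged sketch of how this rewrite is normally reached: parsing a quoted phrase containing a wildcard through ComplexPhraseQueryParser. The version constant, analyzer, field name, phrase, and reader below are placeholders.

// Hedged sketch: a wildcard inside a quoted phrase is rewritten into a SpanNearQuery tree.
Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
ComplexPhraseQueryParser parser = new ComplexPhraseQueryParser(LuceneVersion.LUCENE_48, "name", analyzer);
Query parsed = parser.Parse("\"fred smith*\"");
Query spans = parsed.Rewrite(reader);  // reader: an open IndexReader; negated clauses would yield a SpanNotQuery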
Example n. 21
        public override MergeSpecification FindForcedMerges(SegmentInfos segmentInfos, int maxSegmentCount, IDictionary <SegmentCommitInfo, bool> segmentsToMerge)
        {
            // first find all old segments
            IDictionary <SegmentCommitInfo, bool> oldSegments = new Dictionary <SegmentCommitInfo, bool>();

            foreach (SegmentCommitInfo si in segmentInfos.Segments)
            {
                if (segmentsToMerge.TryGetValue(si, out bool v) && ShouldUpgradeSegment(si))
                {
                    oldSegments[si] = v;
                }
            }

            if (Verbose())
            {
                Message("findForcedMerges: segmentsToUpgrade=" + oldSegments);
            }

            if (oldSegments.Count == 0)
            {
                return(null);
            }

            MergeSpecification spec = m_base.FindForcedMerges(segmentInfos, maxSegmentCount, oldSegments);

            if (spec != null)
            {
                // remove all segments that are in merge specification from oldSegments,
                // the resulting set contains all segments that are left over
                // and will be merged to one additional segment:
                foreach (OneMerge om in spec.Merges)
                {
                    foreach (SegmentCommitInfo sipc in om.Segments)
                    {
                        oldSegments.Remove(sipc);
                    }
                }
            }

            if (oldSegments.Count > 0)
            {
                if (Verbose())
                {
                    Message("findForcedMerges: " + m_base.GetType().Name + " does not want to merge all old segments, merge remaining ones into new segment: " + oldSegments);
                }
                IList <SegmentCommitInfo> newInfos = new JCG.List <SegmentCommitInfo>();
                foreach (SegmentCommitInfo si in segmentInfos.Segments)
                {
                    if (oldSegments.ContainsKey(si))
                    {
                        newInfos.Add(si);
                    }
                }
                // add the final merge
                if (spec == null)
                {
                    spec = new MergeSpecification();
                }
                spec.Add(new OneMerge(newInfos));
            }

            return(spec);
        }
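UpgradeIndexMergePolicy is normally not called directly; a hedged sketch of wiring it into an IndexWriter so that a ForceMerge upgrades old-format segments follows. The analyzer, version constant, and path are placeholders.

// Hedged sketch: wrap the regular merge policy so ForceMerge also upgrades old segments.
Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
IndexWriterConfig config = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
config.SetMergePolicy(new UpgradeIndexMergePolicy(new TieredMergePolicy()));
config.SetOpenMode(OpenMode.CREATE_OR_APPEND);
using (IndexWriter writer = new IndexWriter(FSDirectory.Open(new DirectoryInfo("/path/to/index")), config))
{
    writer.ForceMerge(1); // FindForcedMerges above decides which old-format segments get rewritten
}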
Example n. 22
        public Int64RangeCounter(Int64Range[] ranges)
        {
            // Maps all range inclusive endpoints to int flags; 1
            // = start of interval, 2 = end of interval.  We need to
            // track the start vs end case separately because if a
            // given point is both, then it must be its own
            // elementary interval:
            IDictionary <long, int> endsMap = new Dictionary <long, int>
            {
                [long.MinValue] = 1,
                [long.MaxValue] = 2
            };

            foreach (Int64Range range in ranges)
            {
                if (!endsMap.TryGetValue(range.minIncl, out int cur))
                {
                    endsMap[range.minIncl] = 1;
                }
                else
                {
                    endsMap[range.minIncl] = cur | 1;
                }

                if (!endsMap.TryGetValue(range.maxIncl, out cur))
                {
                    endsMap[range.maxIncl] = 2;
                }
                else
                {
                    endsMap[range.maxIncl] = cur | 2;
                }
            }

            var endsList = new JCG.List <long>(endsMap.Keys);

            endsList.Sort();

            // Build elementaryIntervals (a 1D Venn diagram):
            IList <InclusiveRange> elementaryIntervals = new JCG.List <InclusiveRange>();
            int  upto0 = 1;
            long v     = endsList[0];
            long prev;

            if (endsMap[v] == 3)
            {
                elementaryIntervals.Add(new InclusiveRange(v, v));
                prev = v + 1;
            }
            else
            {
                prev = v;
            }

            while (upto0 < endsList.Count)
            {
                v = endsList[upto0];
                int flags = endsMap[v];
                //System.out.println("  v=" + v + " flags=" + flags);
                if (flags == 3)
                {
                    // This point is both an end and a start; we need to
                    // separate it:
                    if (v > prev)
                    {
                        elementaryIntervals.Add(new InclusiveRange(prev, v - 1));
                    }
                    elementaryIntervals.Add(new InclusiveRange(v, v));
                    prev = v + 1;
                }
                else if (flags == 1)
                {
                    // This point is only the start of an interval;
                    // attach it to next interval:
                    if (v > prev)
                    {
                        elementaryIntervals.Add(new InclusiveRange(prev, v - 1));
                    }
                    prev = v;
                }
                else
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(flags == 2);
                    }
                    // This point is only the end of an interval; attach
                    // it to last interval:
                    elementaryIntervals.Add(new InclusiveRange(prev, v));
                    prev = v + 1;
                }
                //System.out.println("    ints=" + elementaryIntervals);
                upto0++;
            }

            // Build binary tree on top of intervals:
            root = Split(0, elementaryIntervals.Count, elementaryIntervals);

            // Set outputs, so we know which range to output for
            // each node in the tree:
            for (int i = 0; i < ranges.Length; i++)
            {
                root.AddOutputs(i, ranges[i]);
            }

            // Set boundaries (ends of each elementary interval):
            boundaries = new long[elementaryIntervals.Count];
            for (int i = 0; i < boundaries.Length; i++)
            {
                boundaries[i] = elementaryIntervals[i].End;
            }

            leafCounts = new int[boundaries.Length];

            //System.out.println("ranges: " + Arrays.toString(ranges));
            //System.out.println("intervals: " + elementaryIntervals);
            //System.out.println("boundaries: " + Arrays.toString(boundaries));
            //System.out.println("root:\n" + root);
        }
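To make the elementary-interval construction above concrete, here is a hedged worked example traced by hand; the two ranges are made up and nothing below is produced by the class itself.

// Hypothetical ranges: [0..10] and [5..20]
// endsMap            : { long.MinValue:1, 0:1, 5:1, 10:2, 20:2, long.MaxValue:2 }
// elementaryIntervals: [MinValue..-1], [0..4], [5..10], [11..20], [21..MaxValue]
// boundaries         : { -1, 4, 10, 20, long.MaxValue }   (the End of each elementary interval)
// Each requested range is then a union of whole elementary intervals, which is what
// AddOutputs records on the tree nodes.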
Example n. 23
 /// <summary>
 /// Query should be rewritten for wild/fuzzy support.
 /// </summary>
 /// <param name="query"> rewritten query </param>
 /// <returns> payloads Collection </returns>
 /// <exception cref="IOException"> if there is a low-level I/O error </exception>
 public virtual ICollection<byte[]> GetPayloadsForQuery(Query query)
 {
     var payloads = new JCG.List<byte[]>();
     QueryToSpanQuery(query, payloads);
     return payloads;
 }
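In the Lucene codebase this method belongs to PayloadSpanUtil; a hedged usage sketch follows, where the reader, field, and term are placeholders and the payload bytes are whatever was attached at indexing time.

// Hedged sketch: collect payloads at positions matching a single term (placeholder reader/field/term).
PayloadSpanUtil util = new PayloadSpanUtil(reader.Context);   // reader: an open IndexReader
ICollection<byte[]> payloads = util.GetPayloadsForQuery(new TermQuery(new Term("body", "lucene")));
foreach (byte[] payload in payloads)
{
    Console.WriteLine(BitConverter.ToString(payload));
}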
Example n. 24
        public override void Flush(IDictionary <string, TermsHashConsumerPerField> fieldsToFlush, SegmentWriteState state)
        {
            // Gather all FieldData's that have postings, across all
            // ThreadStates
            IList <FreqProxTermsWriterPerField> allFields = new JCG.List <FreqProxTermsWriterPerField>();

            foreach (TermsHashConsumerPerField f in fieldsToFlush.Values)
            {
                FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField)f;
                if (perField.termsHashPerField.bytesHash.Count > 0)
                {
                    allFields.Add(perField);
                }
            }

            int numAllFields = allFields.Count;

            // Sort by field name
            CollectionUtil.IntroSort(allFields);

            FieldsConsumer consumer = state.SegmentInfo.Codec.PostingsFormat.FieldsConsumer(state);

            bool success = false;

            try
            {
                TermsHash termsHash = null;

                /*
                 * Current writer chain:
                 * FieldsConsumer
                 * -> IMPL: FormatPostingsTermsDictWriter
                 *  -> TermsConsumer
                 *    -> IMPL: FormatPostingsTermsDictWriter.TermsWriter
                 *      -> DocsConsumer
                 *        -> IMPL: FormatPostingsDocsWriter
                 *          -> PositionsConsumer
                 *            -> IMPL: FormatPostingsPositionsWriter
                 */

                for (int fieldNumber = 0; fieldNumber < numAllFields; fieldNumber++)
                {
                    FieldInfo fieldInfo = allFields[fieldNumber].fieldInfo;

                    FreqProxTermsWriterPerField fieldWriter = allFields[fieldNumber];

                    // If this field has postings then add them to the
                    // segment
                    fieldWriter.Flush(fieldInfo.Name, consumer, state);

                    TermsHashPerField perField = fieldWriter.termsHashPerField;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(termsHash == null || termsHash == perField.termsHash);
                    }
                    termsHash = perField.termsHash;
                    int numPostings = perField.bytesHash.Count;
                    perField.Reset();
                    perField.ShrinkHash(/* numPostings // LUCENENET: Not used */);
                    fieldWriter.Reset();
                }

                if (termsHash != null)
                {
                    termsHash.Reset();
                }
                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(consumer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(consumer);
                }
            }
        }
Example n. 25
        public static void Main(string[] args)
        {
            if (args.Length < 5)
            {
                // LUCENENET specific - our wrapper console shows the correct usage
                throw new ArgumentException();
                //Console.Error.WriteLine("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]");
                //Console.Error.WriteLine("\tinputIndex\tpath to input index, multiple values are ok");
                //Console.Error.WriteLine("\t-out ouputDir\tpath to output directory to contain partial indexes");
                //Console.Error.WriteLine("\t-num numParts\tnumber of parts to produce");
                //Console.Error.WriteLine("\t-seq\tsequential docid-range split (default is round-robin)");
                //Environment.Exit(-1);
            }
            IList <IndexReader> indexes = new JCG.List <IndexReader>();

            try
            {
                string outDir   = null;
                int    numParts = -1;
                bool   seq      = false;
                for (int i = 0; i < args.Length; i++)
                {
                    if (args[i].Equals("-out", StringComparison.Ordinal))
                    {
                        outDir = args[++i];
                    }
                    else if (args[i].Equals("-num", StringComparison.Ordinal))
                    {
                        numParts = Convert.ToInt32(args[++i], CultureInfo.InvariantCulture);
                    }
                    else if (args[i].Equals("-seq", StringComparison.Ordinal))
                    {
                        seq = true;
                    }
                    else
                    {
                        DirectoryInfo file = new DirectoryInfo(args[i]);
                        if (!file.Exists)
                        {
                            Console.Error.WriteLine("Invalid input path - skipping: " + file);
                            continue;
                        }
                        using Store.Directory dir = FSDirectory.Open(new DirectoryInfo(args[i]));
                        try
                        {
                            if (!DirectoryReader.IndexExists(dir))
                            {
                                Console.Error.WriteLine("Invalid input index - skipping: " + file);
                                continue;
                            }
                        }
                        catch (Exception e) when(e.IsException())
                        {
                            Console.Error.WriteLine("Invalid input index - skipping: " + file);
                            continue;
                        }
                        indexes.Add(DirectoryReader.Open(dir));
                    }
                }
                if (outDir is null)
                {
                    throw new Exception("Required argument missing: -out outputDir");
                }
                if (numParts < 2)
                {
                    throw new Exception("Invalid value of required argument: -num numParts");
                }
                if (indexes.Count == 0)
                {
                    throw new Exception("No input indexes to process");
                }
                DirectoryInfo @out = new DirectoryInfo(outDir);
                @out.Create();
                if (!new DirectoryInfo(outDir).Exists)
                {
                    throw new Exception("Can't create output directory: " + @out);
                }
                Store.Directory[] dirs = new Store.Directory[numParts];
                try
                {
                    for (int i = 0; i < numParts; i++)
                    {
                        dirs[i] = FSDirectory.Open(new DirectoryInfo(Path.Combine(@out.FullName, "part-" + i)));
                    }
                    MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
                    IndexReader            input;
                    if (indexes.Count == 1)
                    {
                        input = indexes[0];
                    }
                    else
                    {
                        input = new MultiReader(indexes.ToArray());
                    }
#pragma warning disable 612, 618
                    splitter.Split(LuceneVersion.LUCENE_CURRENT, input, dirs, seq);
#pragma warning restore 612, 618
                }
                finally
                {
                    // LUCENENET specific - properly dispose directories to prevent resource leaks
                    IOUtils.Dispose(dirs);
                }
            }
            finally
            {
                // LUCENENET specific - properly dispose index readers to prevent resource leaks
                IOUtils.Dispose(indexes);
            }
        }
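For completeness, a hedged sketch of driving the splitter programmatically rather than through Main; the input/output paths and the number of parts are placeholders, and the Split call mirrors the one used above.

// Hedged sketch: split one index into three parts round-robin (placeholder paths).
using (DirectoryReader input = DirectoryReader.Open(FSDirectory.Open(new DirectoryInfo("/path/to/index"))))
{
    Store.Directory[] parts = new Store.Directory[3];
    for (int i = 0; i < parts.Length; i++)
    {
        parts[i] = FSDirectory.Open(new DirectoryInfo("/path/to/out/part-" + i));
    }
    try
    {
#pragma warning disable 612, 618
        new MultiPassIndexSplitter().Split(LuceneVersion.LUCENE_CURRENT, input, parts, false); // false = round-robin
#pragma warning restore 612, 618
    }
    finally
    {
        IOUtils.Dispose(parts); // dispose output directories
    }
}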
Example n. 26
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            // LUCENENET: Added guard clause for null
            if (key is null)
            {
                throw new ArgumentNullException(nameof(key));
            }

            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key);

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(gramCount <= grams);
                    }

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token is null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <Int64>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                JCG.List <LookupResult> results = new JCG.List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new JCG.HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token is null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    Int64 prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (Exception bogus) when(bogus.IsIOException())
                    {
                        throw RuntimeException.Create(bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput is null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(output != null);
                            }
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment is null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(finalLastToken.Offset == 0);
                    }

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <Int64> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <Int64> searcher = new TopNSearcherAnonymousClass(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(completions.IsComplete);
                        }
                    }
                    catch (Exception bogus) when(bogus.IsIOException())
                    {
                        throw RuntimeException.Create(bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <Int64> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(token.Length - i - 1 > 0);
                                }
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(results.Count == seen.Count);
                        }
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue: ;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(Comparer <Lookup.LookupResult> .Create((a, b) =>
                {
                    if (a.Value > b.Value)
                    {
                        return(-1);
                    }
                    else if (a.Value < b.Value)
                    {
                        return(1);
                    }
                    else
                    {
                        // Tie break by UTF16 sort order:
                        return(a.Key.CompareToOrdinal(b.Key));
                    }
                }));

                if (results.Count > num)
                {
                    results.RemoveRange(num, results.Count - num); // LUCENENET: Converted end index to length
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Example n. 27
        public static IList <string> SplitWS(string s, bool decode)
        {
            IList <string> lst = new JCG.List <string>(2);
            StringBuilder  sb = new StringBuilder();
            int            pos = 0, end = s.Length;

            while (pos < end)
            {
                char ch = s[pos++];
                if (char.IsWhiteSpace(ch))
                {
                    if (sb.Length > 0)
                    {
                        lst.Add(sb.ToString());
                        sb = new StringBuilder();
                    }
                    continue;
                }

                if (ch == '\\')
                {
                    if (!decode)
                    {
                        sb.Append(ch);
                    }
                    if (pos >= end) // ERROR, or let it go?
                    {
                        break;
                    }
                    ch = s[pos++];
                    if (decode)
                    {
                        switch (ch)
                        {
                        case 'n':
                            ch = '\n';
                            break;

                        case 't':
                            ch = '\t';
                            break;

                        case 'r':
                            ch = '\r';
                            break;

                        case 'b':
                            ch = '\b';
                            break;

                        case 'f':
                            ch = '\f';
                            break;
                        }
                    }
                }

                sb.Append(ch);
            }

            if (sb.Length > 0)
            {
                lst.Add(sb.ToString());
            }

            return(lst);
        }
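A hedged sketch of the whitespace/escape handling above, assuming the method is callable from the same helper class (it is a port of Solr's StrUtils.splitWS); inputs and expected outputs are shown as comments.

// Hedged sketch of the whitespace/escape behavior:
IList<string> plain   = SplitWS("red green  blue", decode: true);        // ["red", "green", "blue"]
IList<string> decoded = SplitWS("two\\ words \\tTabbed", decode: true);  // ["two words", "\tTabbed"] - escaped space joins, \t decoded
IList<string> raw     = SplitWS("two\\ words", decode: false);           // ["two\\ words"] - backslash kept verbatim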
Example n. 28
        public virtual void TestBS2DisjunctionNextVsAdvance()
        {
            Directory         d = NewDirectory();
            RandomIndexWriter w = new RandomIndexWriter(Random, d);
            int numDocs         = AtLeast(300);

            for (int docUpto = 0; docUpto < numDocs; docUpto++)
            {
                string contents = "a";
                if (Random.Next(20) <= 16)
                {
                    contents += " b";
                }
                if (Random.Next(20) <= 8)
                {
                    contents += " c";
                }
                if (Random.Next(20) <= 4)
                {
                    contents += " d";
                }
                if (Random.Next(20) <= 2)
                {
                    contents += " e";
                }
                if (Random.Next(20) <= 1)
                {
                    contents += " f";
                }
                Document doc = new Document();
                doc.Add(new TextField("field", contents, Field.Store.NO));
                w.AddDocument(doc);
            }
            w.ForceMerge(1);
            IndexReader   r = w.GetReader();
            IndexSearcher s = NewSearcher(r);

            w.Dispose();

            for (int iter = 0; iter < 10 * RandomMultiplier; iter++)
            {
                if (Verbose)
                {
                    Console.WriteLine("iter=" + iter);
                }
                IList <string> terms = new JCG.List <string> {
                    "a", "b", "c", "d", "e", "f"
                };
                int numTerms = TestUtil.NextInt32(Random, 1, terms.Count);
                while (terms.Count > numTerms)
                {
                    terms.RemoveAt(Random.Next(terms.Count));
                }

                if (Verbose)
                {
                    Console.WriteLine("  terms=" + terms);
                }

                BooleanQuery q = new BooleanQuery();
                foreach (string term in terms)
                {
                    q.Add(new BooleanClause(new TermQuery(new Term("field", term)), Occur.SHOULD));
                }

                Weight weight = s.CreateNormalizedWeight(q);

                Scorer scorer = weight.GetScorer(s.m_leafContexts[0], null);

                // First pass: just use .NextDoc() to gather all hits
                IList <ScoreDoc> hits = new JCG.List <ScoreDoc>();
                while (scorer.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    hits.Add(new ScoreDoc(scorer.DocID, scorer.GetScore()));
                }

                if (Verbose)
                {
                    Console.WriteLine("  " + hits.Count + " hits");
                }

                // Now, randomly next/advance through the list and
                // verify exact match:
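                // Contract being exercised: a Scorer is a DocIdSetIterator, whose
                // Advance(target) moves to the first document with id >= target (or returns
                // NO_MORE_DOCS when exhausted) and whose NextDoc() behaves like
                // Advance(DocID + 1). Since 'hits' holds every matching doc in order,
                // advancing to hits[nextUpto].Doc must land exactly on that hit.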
                for (int iter2 = 0; iter2 < 10; iter2++)
                {
                    weight = s.CreateNormalizedWeight(q);
                    scorer = weight.GetScorer(s.m_leafContexts[0], null);

                    if (Verbose)
                    {
                        Console.WriteLine("  iter2=" + iter2);
                    }

                    int upto = -1;
                    while (upto < hits.Count)
                    {
                        int nextUpto;
                        int nextDoc;
                        int left = hits.Count - upto;
                        if (left == 1 || Random.NextBoolean())
                        {
                            // next
                            nextUpto = 1 + upto;
                            nextDoc  = scorer.NextDoc();
                        }
                        else
                        {
                            // advance
                            int inc = TestUtil.NextInt32(Random, 1, left - 1);
                            nextUpto = inc + upto;
                            nextDoc  = scorer.Advance(hits[nextUpto].Doc);
                        }

                        if (nextUpto == hits.Count)
                        {
                            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, nextDoc);
                        }
                        else
                        {
                            ScoreDoc hit = hits[nextUpto];
                            Assert.AreEqual(hit.Doc, nextDoc);

                            // LUCENENET: For some weird reason, on x86 in .NET Framework with optimizations enabled, using == (as they did in Lucene) doesn't work, but using AreEqual with an epsilon of 0f does.
                            // Test for precise float equality:
                            Assert.AreEqual(hit.Score, scorer.GetScore(), 0f, "doc " + hit.Doc + " has wrong score: expected=" + hit.Score + " actual=" + scorer.GetScore());
                        }
                        upto = nextUpto;
                    }
                }
            }

            r.Dispose();
            d.Dispose();
        }
Example n. 29
        /// <summary>
        /// Auto-completes a given prefix query using Depth-First Search with the end
        /// of prefix as source node each time finding a new leaf to get a complete key
        /// to be added in the suggest list.
        /// </summary>
        /// <param name="root">
        ///          a reference to root node of TST. </param>
        /// <param name="s">
        ///          prefix query to be auto-completed. </param>
        /// <param name="x">
        ///          index of current character to be searched while traversing through
        ///          the prefix in TST. </param>
        /// <returns> suggest list of auto-completed keys for the given prefix query. </returns>
        public virtual IList <TernaryTreeNode> PrefixCompletion(TernaryTreeNode root, string s, int x)
        {
            TernaryTreeNode p = root;

            JCG.List <TernaryTreeNode> suggest = new JCG.List <TernaryTreeNode>();

            while (p != null)
            {
                if (s[x] < p.splitchar)
                {
                    p = p.loKid;
                }
                else if (s[x] == p.splitchar)
                {
                    if (x == s.Length - 1)
                    {
                        break;
                    }
                    else
                    {
                        x++;
                    }
                    p = p.eqKid;
                }
                else
                {
                    p = p.hiKid;
                }
            }

            if (p == null)
            {
                return(suggest);
            }
            if (p.eqKid == null && p.token == null)
            {
                return(suggest);
            }
            if (p.eqKid == null && p.token != null)
            {
                suggest.Add(p);
                return(suggest);
            }

            if (p.token != null)
            {
                suggest.Add(p);
            }
            p = p.eqKid;

            var st = new Stack <TernaryTreeNode>();

            st.Push(p);
            while (st.Count > 0)
            {
                TernaryTreeNode top = st.Peek();
                st.Pop();
                if (top.token != null)
                {
                    suggest.Add(top);
                }
                if (top.eqKid != null)
                {
                    st.Push(top.eqKid);
                }
                if (top.loKid != null)
                {
                    st.Push(top.loKid);
                }
                if (top.hiKid != null)
                {
                    st.Push(top.hiKid);
                }
            }
            return(suggest);
        }
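
        // A minimal usage sketch (hypothetical, not part of the original source; it assumes
        // TernaryTreeNode can be constructed and its fields assigned from this class):
        // a hand-built TST holding the keys "ab" and "ac", completed from the prefix "a".
        private void PrefixCompletionExample()
        {
            var root = new TernaryTreeNode();
            var b = new TernaryTreeNode();
            var c = new TernaryTreeNode();
            root.splitchar = 'a';
            b.splitchar = 'b';
            b.token = "ab";     // the node for the last char of a key stores the full key
            c.splitchar = 'c';
            c.token = "ac";
            root.eqKid = b;     // next character after matching 'a'
            b.hiKid = c;        // 'c' sorts after 'b'

            IList<TernaryTreeNode> suggestions = PrefixCompletion(root, "a", 0);
            // suggestions now holds the two nodes carrying "ab" and "ac".
        }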
Example n. 30
        /// <summary>
        /// Call this only once (if you subclass!) </summary>
        protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
        {
            FieldInfo info = reader.FieldInfos.FieldInfo(m_field);

            if (info != null && info.HasDocValues)
            {
                throw IllegalStateException.Create("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
            }
            //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
            long startTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

            m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

            int maxDoc = reader.MaxDoc;

            int[] index    = new int[maxDoc];     // immediate term numbers, or the index into the byte[] representing the last number
            int[] lastTerm = new int[maxDoc];     // last term we saw for this document
            var   bytes    = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

            Fields fields = reader.Fields;

            if (fields == null)
            {
                // No terms
                return;
            }
            Terms terms = fields.GetTerms(m_field);

            if (terms == null)
            {
                // No terms
                return;
            }

            TermsEnum te        = terms.GetEnumerator();
            BytesRef  seekStart = termPrefix ?? new BytesRef();

            //System.out.println("seekStart=" + seekStart.utf8ToString());
            if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
            {
                // No terms match
                return;
            }

            // If we need our "term index wrapper", these will be
            // init'd below:
            IList <BytesRef> indexedTerms      = null;
            PagedBytes       indexedTermsBytes = null;

            bool testedOrd = false;

            // we need a minimum of 9 bytes, but round up to 12 since the space would
            // be wasted with most allocators anyway.
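            // (The 9 covers the worst case below: up to 4 bytes already packed into the
            // int are copied into tempArr, then one more delta is appended as a vInt of
            // up to 5 bytes.)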
            var tempArr = new sbyte[12];

            //
            // enumerate all terms, and build an intermediate form of the un-inverted field.
            //
            // During this intermediate form, every document has a (potential) byte[]
            // and the int[maxDoc()] array either contains the termNumber list directly
            // or the *end* offset of the termNumber list in its byte array (for faster
            // appending and faster creation of the final form).
            //
            // idea... if things are too large while building, we could do a range of docs
            // at a time (but it would be a fair amount slower to build)
            // could also do ranges in parallel to take advantage of multiple CPUs
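            // Worked example of the intermediate form (illustrative numbers only): suppose
            // a document sees term numbers 3 and then 7. What is stored is the delta plus
            // TNUM_OFFSET for each, i.e. (3 - 0 + TNUM_OFFSET) followed by (7 - 3 + TNUM_OFFSET),
            // each written as a vInt. While those bytes still fit in the int, index[doc]
            // holds them directly; once they no longer fit, the bytes move into bytes[doc]
            // and index[doc] becomes (endOffset << 8) | 1, where the low byte 1 means
            // "offset into the byte array follows".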

            // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
            // values.  this requires going over the field first to find the most
            // frequent terms ahead of time.

            int termNum = 0;

            m_docsEnum = null;

            // Loop begins with te positioned to first term (we call
            // seek above):
            for (; ;)
            {
                BytesRef t = te.Term;
                if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
                {
                    break;
                }
                //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

                if (!testedOrd)
                {
                    try
                    {
                        m_ordBase = (int)te.Ord;
                        //System.out.println("got ordBase=" + ordBase);
                    }
                    catch (Exception uoe) when(uoe.IsUnsupportedOperationException())
                    {
                        // Reader cannot provide ord support, so we wrap
                        // our own support by creating our own terms index:
                        indexedTerms      = new JCG.List <BytesRef>();
                        indexedTermsBytes = new PagedBytes(15);
                        //System.out.println("NO ORDS");
                    }
                    testedOrd = true;
                }

                VisitTerm(te, termNum);

                if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
                {
                    // Index this term
                    m_sizeOfIndexedStrings += t.Length;
                    BytesRef indexedTerm = new BytesRef();
                    indexedTermsBytes.Copy(t, indexedTerm);
                    // TODO: really should 1) strip off useless suffix,
                    // and 2) use FST not array/PagedBytes
                    indexedTerms.Add(indexedTerm);
                }

                int df = te.DocFreq;
                if (df <= m_maxTermDocFreq)
                {
                    m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

                    // dF, but takes deletions into account
                    int actualDF = 0;

                    for (; ;)
                    {
                        int doc = m_docsEnum.NextDoc();
                        if (doc == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        //System.out.println("  chunk=" + chunk + " docs");

                        actualDF++;
                        m_termInstances++;

                        //System.out.println("    docID=" + doc);
                        // add TNUM_OFFSET to the term number to make room for special reserved values:
                        // 0 (end term) and 1 (index into byte array follows)
                        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                        lastTerm[doc] = termNum;
                        int val = index[doc];

                        if ((val & 0xff) == 1)
                        {
                            // index into byte array (actually the end of
                            // the doc-specific byte[] when building)
                            int pos    = val.TripleShift(8);
                            int ilen   = VInt32Size(delta);
                            var arr    = bytes[doc];
                            int newend = pos + ilen;
                            if (newend > arr.Length)
                            {
                                // We avoid a doubling strategy to lower memory usage.
                                // this faceting method isn't for docs with many terms.
                                // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                                // TODO: figure out what array lengths we can round up to w/o actually using more memory
                                // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                                // It should be safe to round up to the nearest 32 bits in any case.)
                                int newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment
                                var newarr = new sbyte[newLen];
                                Array.Copy(arr, 0, newarr, 0, pos);
                                arr        = newarr;
                                bytes[doc] = newarr;
                            }
                            pos        = WriteInt32(delta, arr, pos);
                            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                        }
                        else
                        {
                            // OK, this int has data in it... find the end (a zero starting byte - not
                            // part of another number, hence not following a byte with the high bit set).
                            int ipos;
                            if (val == 0)
                            {
                                ipos = 0;
                            }
                            else if ((val & 0x0000ff80) == 0)
                            {
                                ipos = 1;
                            }
                            else if ((val & 0x00ff8000) == 0)
                            {
                                ipos = 2;
                            }
                            else if ((val & 0xff800000) == 0)
                            {
                                ipos = 3;
                            }
                            else
                            {
                                ipos = 4;
                            }

                            //System.out.println("      ipos=" + ipos);

                            int endPos = WriteInt32(delta, tempArr, ipos);
                            //System.out.println("      endpos=" + endPos);
                            if (endPos <= 4)
                            {
                                //System.out.println("      fits!");
                                // value will fit in the integer... move bytes back
                                for (int j = ipos; j < endPos; j++)
                                {
                                    val |= (tempArr[j] & 0xff) << (j << 3);
                                }
                                index[doc] = val;
                            }
                            else
                            {
                                // value won't fit... move integer into byte[]
                                for (int j = 0; j < ipos; j++)
                                {
                                    tempArr[j] = (sbyte)val;
                                    val        = val.TripleShift(8);
                                }
                                // point at the end index in the byte[]
                                index[doc] = (endPos << 8) | 1;
                                bytes[doc] = tempArr;
                                tempArr    = new sbyte[12];
                            }
                        }
                    }
                    SetActualDocFreq(termNum, actualDF);
                }

                termNum++;
                if (!te.MoveNext())
                {
                    break;
                }
            }

            m_numTermsInField = termNum;

            long midPoint = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

            if (m_termInstances == 0)
            {
                // we didn't invert anything
                // lower memory consumption.
                m_tnums = null;
            }
            else
            {
                this.m_index = index;

                //
                // transform intermediate form into the final form, building a single byte[]
                // at a time, and releasing the intermediate byte[]s as we go to avoid
                // increasing the memory footprint.
                //

                for (int pass = 0; pass < 256; pass++)
                {
                    var target = m_tnums[pass];
                    var pos    = 0; // end in target;
                    if (target != null)
                    {
                        pos = target.Length;
                    }
                    else
                    {
                        target = new sbyte[4096];
                    }

                    // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
                    // where pp is the pass (which array we are building), and xx is all values.
                    // each pass shares the same byte[] for termNumber lists.
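                    // e.g. pass 0 covers docs 0..65535, then 16777216..16842751, and so on;
                    // pass 1 covers docs 65536..131071, then 16842752..16908287, etc.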
                    for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
                    {
                        int lim = Math.Min(docbase + (1 << 16), maxDoc);
                        for (int doc = docbase; doc < lim; doc++)
                        {
                            //System.out.println("  pass="******" process docID=" + doc);
                            int val = index[doc];
                            if ((val & 0xff) == 1)
                            {
                                int len = val.TripleShift(8);
                                //System.out.println("    ptr pos=" + pos);
                                index[doc] = (pos << 8) | 1; // change index to point to start of array
                                if ((pos & 0xff000000) != 0)
                                {
                                    // we only have 24 bits for the array index
                                    throw IllegalStateException.Create("Too many values for UnInvertedField faceting on field " + m_field);
                                }
                                var arr = bytes[doc];

                                /*
                                 * for(byte b : arr) {
                                 * //System.out.println("      b=" + Integer.toHexString((int) b));
                                 * }
                                 */
                                bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                                if (target.Length <= pos + len)
                                {
                                    int newlen = target.Length;

                                    //* we don't have to worry about the array getting too large
                                    // since the "pos" param will overflow first (only 24 bits available)
                                    // if ((newlen<<1) <= 0) {
                                    //  // overflow...
                                    //  newlen = Integer.MAX_VALUE;
                                    //  if (newlen <= pos + len) {
                                    //    throw new SolrException(400,"Too many terms to uninvert field!");
                                    //  }
                                    // } else {
                                    //  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                                    // }
                                    //
                                    while (newlen <= pos + len) // doubling strategy
                                    {
                                        newlen <<= 1;
                                    }
                                    var newtarget = new sbyte[newlen];
                                    Array.Copy(target, 0, newtarget, 0, pos);
                                    target = newtarget;
                                }
                                Array.Copy(arr, 0, target, pos, len);
                                pos += len + 1; // skip single byte at end and leave it 0 for terminator
                            }
                        }
                    }

                    // shrink array
                    if (pos < target.Length)
                    {
                        var newtarget = new sbyte[pos];
                        Array.Copy(target, 0, newtarget, 0, pos);
                        target = newtarget;
                    }

                    m_tnums[pass] = target;

                    if ((pass << 16) > maxDoc)
                    {
                        break;
                    }
                }
            }
            if (indexedTerms != null)
            {
                m_indexedTermsArray = new BytesRef[indexedTerms.Count];
                indexedTerms.CopyTo(m_indexedTermsArray, 0);
            }

            long endTime = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results

            m_total_time  = (int)(endTime - startTime);
            m_phase1_time = (int)(midPoint - startTime);
        }