Example #1
        private GroupedFacetResult CreateExpectedFacetResult(string searchTerm, IndexContext context, int offset, int limit, int minCount, bool orderByCount, string facetPrefix)
        {
            JCG.Dictionary <string, ISet <string> > facetGroups;
            if (!context.searchTermToFacetGroups.TryGetValue(searchTerm, out facetGroups))
            {
                facetGroups = new JCG.Dictionary <string, ISet <string> >();
            }

            int           totalCount     = 0;
            int           totalMissCount = 0;
            ISet <string> facetValues;

            if (facetPrefix != null)
            {
                facetValues = new JCG.HashSet <string>();
                foreach (string facetValue in context.facetValues)
                {
                    if (facetValue != null && facetValue.StartsWith(facetPrefix, StringComparison.Ordinal))
                    {
                        facetValues.Add(facetValue);
                    }
                }
            }
            else
            {
                facetValues = context.facetValues;
            }

            List <TermGroupFacetCollector.FacetEntry> entries = new List <TermGroupFacetCollector.FacetEntry>(facetGroups.Count);

            // also includes facets with count 0
            foreach (string facetValue in facetValues)
            {
                if (facetValue == null)
                {
                    continue;
                }

                int count = facetGroups.TryGetValue(facetValue, out ISet <string> groups) && groups != null ? groups.Count : 0;

                if (count >= minCount)
                {
                    entries.Add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue), count));
                }
                totalCount += count;
            }

            // Only include null count when no facet prefix is specified
            if (facetPrefix == null)
            {
                if (facetGroups.TryGetValue(null, out ISet <string> groups) && groups != null)
                {
                    totalMissCount = groups.Count;
                }
            }

            entries.Sort(new ComparerAnonymousHelper2(orderByCount));

            int endOffset = offset + limit;
            List <TermGroupFacetCollector.FacetEntry> entriesResult;

            if (offset >= entries.Count)
            {
                entriesResult = new List <TermGroupFacetCollector.FacetEntry>();
            }
            else if (endOffset >= entries.Count)
            {
                entriesResult = entries.GetRange(offset, entries.Count - offset);
            }
            else
            {
                entriesResult = entries.GetRange(offset, endOffset - offset);
            }
            return(new GroupedFacetResult(totalCount, totalMissCount, entriesResult));
        }
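The offset/limit arithmetic at the end of that method is a standard paging clamp: an offset past the end yields an empty page, and otherwise the page is truncated to whatever remains. A minimal stand-alone sketch of the same logic (a hypothetical helper, assuming the usual System and System.Collections.Generic usings):

        // Hypothetical paging helper equivalent to the if/else chain above.
        private static List<T> GetPage<T>(List<T> entries, int offset, int limit)
        {
            if (offset >= entries.Count)
            {
                return new List<T>(); // offset past the end: empty page
            }
            int count = Math.Min(limit, entries.Count - offset);
            return entries.GetRange(offset, count);
        }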
Example #2
        public virtual void TestSurrogatesOrder()
        {
            Directory dir    = NewDirectory();
            var       config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            config.Codec = new PreFlexRWCodec();
            RandomIndexWriter w = new RandomIndexWriter(Random, dir, config);

            int numField = TestUtil.NextInt32(Random, 2, 5);

            int uniqueTermCount = 0;

            int tc = 0;

            var fieldTerms = new List <Term>();

            for (int f = 0; f < numField; f++)
            {
                string field    = "f" + f;
                int    numTerms = AtLeast(200);

                ISet <string> uniqueTerms = new JCG.HashSet <string>();

                for (int i = 0; i < numTerms; i++)
                {
                    string term = GetRandomString(Random) + "_ " + (tc++);
                    uniqueTerms.Add(term);
                    fieldTerms.Add(new Term(field, term));
                    Documents.Document doc = new Documents.Document();
                    doc.Add(NewStringField(field, term, Field.Store.NO));
                    w.AddDocument(doc);
                }
                uniqueTermCount += uniqueTerms.Count;
            }

            IndexReader reader = w.GetReader();

            if (VERBOSE)
            {
                fieldTerms.Sort(TermAsUTF16Comparer);

                Console.WriteLine("\nTEST: UTF16 order");
                foreach (Term t in fieldTerms)
                {
                    Console.WriteLine("  " + ToHexString(t));
                }
            }

            // sorts in code point order:
            fieldTerms.Sort();

            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: codepoint order");
                foreach (Term t in fieldTerms)
                {
                    Console.WriteLine("  " + ToHexString(t));
                }
            }

            Term[] fieldTermsArray = fieldTerms.ToArray();

            //SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);

            //FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
            //Assert.IsNotNull(fields);

            DoTestStraightEnum(fieldTerms, reader, uniqueTermCount);
            DoTestSeekExists(Random, fieldTerms, reader);
            DoTestSeekDoesNotExist(Random, numField, fieldTerms, fieldTermsArray, reader);

            reader.Dispose();
            w.Dispose();
            dir.Dispose();
        }
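The point of sorting twice in that test is that .NET strings compare by UTF-16 code unit, while Lucene's term order is by Unicode code point, and the two disagree exactly for supplementary characters. A self-contained illustration of the disagreement:

        // U+10000 is stored as the surrogate pair \uD800\uDC00.
        string supplementary = "\uD800\uDC00"; // code point 0x10000
        string bmp           = "\uFFFF";       // code point 0xFFFF

        // UTF-16 code unit order: 0xD800 < 0xFFFF, so the surrogate pair sorts first.
        Console.WriteLine(string.CompareOrdinal(supplementary, bmp) < 0); // True

        // Code point order: 0x10000 > 0xFFFF, so the supplementary character sorts last.
        Console.WriteLine(char.ConvertToUtf32(supplementary, 0) > char.ConvertToUtf32(bmp, 0)); // True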
Example #3
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(gramCount <= grams);
                    }

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new JCG.HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(output != null);
                            }
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(finalLastToken.Offset == 0);
                    }

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(completions.IsComplete);
                        }
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(token.Length - i - 1 > 0);
                                }
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(results.Count == seen.Count);
                        }
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue:;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(Comparer <Lookup.LookupResult> .Create((a, b) =>
                {
                    if (a.Value > b.Value)
                    {
                        return(-1);
                    }
                    else if (a.Value < b.Value)
                    {
                        return(1);
                    }
                    else
                    {
                        // Tie break by UTF16 sort order:
                        return(a.Key.CompareToOrdinal(b.Key));
                    }
                }));

                if (results.Count > num)
                {
                    results.RemoveRange(num, results.Count - num); //results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
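The LUCENENET NOTE inside that method is worth unpacking: long.MaxValue is not exactly representable as a double (it rounds up to 2^63, one past the largest long), so computing the score in double can round past long.MaxValue and the cast back to long overflows into a negative value; decimal represents the value exactly. A two-line demonstration of the rounding:

        // long.MaxValue rounds up to 2^63 as a double, one past the largest long;
        // decimal holds it exactly, which is why the score math above uses decimal.
        Console.WriteLine((double)long.MaxValue == 9223372036854775808.0);  // True (rounded to 2^63)
        Console.WriteLine((decimal)long.MaxValue == 9223372036854775807m);  // True (exact)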
Example #4
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }
            Terms terms = MultiFields.GetTerms(reader, f.fieldName);

            if (terms == null)
            {
                return;
            }
            TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);

            try
            {
                ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();

                int           corpusNumDocs  = reader.NumDocs;
                ISet <string> processedTerms = new JCG.HashSet <string>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    string term = termAtt.ToString();
                    if (!processedTerms.Contains(term))
                    {
                        processedTerms.Add(term);
                        ScoreTermQueue  variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                        float           minScore  = 0;
                        Term            startTerm = new Term(f.fieldName, term);
                        AttributeSource atts      = new AttributeSource();
                        IMaxNonCompetitiveBoostAttribute maxBoostAtt =
                            atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                        SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                        //store the df so all variants use same idf
                        int             df                   = reader.DocFreq(startTerm);
                        int             numVariants          = 0;
                        int             totalVariantDocFreqs = 0;
                        BytesRef        possibleMatch;
                        IBoostAttribute boostAtt =
                            fe.Attributes.AddAttribute <IBoostAttribute>();
                        while (fe.MoveNext())
                        {
                            possibleMatch = fe.Term;
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq;
                            float score = boostAtt.Boost;
                            if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                                variantsQ.InsertWithOverflow(st);
                                minScore = variantsQ.Top.Score; // maintain minScore
                            }
                            maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                        }

                        if (numVariants > 0)
                        {
                            int avgDf = totalVariantDocFreqs / numVariants;
                            if (df == 0)    //no direct match we can use as df for all variants
                            {
                                df = avgDf; //use avg df of all variants
                            }

                            // take the top variants (scored by edit distance) and reset the score
                            // to include an IDF factor then add to the global queue for ranking
                            // overall top query terms
                            int size = variantsQ.Count;
                            for (int i = 0; i < size; i++)
                            {
                                ScoreTerm st = variantsQ.Pop();
                                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                                q.InsertWithOverflow(st);
                            }
                        }
                    }
                }
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
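The variantsQ dance above — InsertWithOverflow into a fixed-capacity queue, then read the weakest entry off the top to maintain minScore — is the usual bounded top-K idiom. ScoreTermQueue is a Lucene PriorityQueue subclass; a rough stand-in using the BCL, with the caveat that SortedSet drops duplicate scores, so it is only an illustration:

        // Hypothetical bounded top-K collector. Keeps the k highest scores and
        // exposes the current minimum so non-competitive candidates can be skipped.
        private static void InsertWithOverflow(SortedSet<float> topK, int k, float score)
        {
            if (topK.Count < k)
            {
                topK.Add(score);
            }
            else if (score > topK.Min)
            {
                topK.Remove(topK.Min); // evict the weakest entry
                topK.Add(score);
            }
        }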
Example #5
        public virtual void TestIndexing()
        {
            DirectoryInfo        tmpDir = CreateTempDir("TestNeverDelete");
            BaseDirectoryWrapper d      = NewFSDirectory(tmpDir);

            // We want to "see" files removed if Lucene removed
            // them.  This is still worth running on Windows, since
            // the IR opens and closes some of the files.
            if (d is MockDirectoryWrapper)
            {
                ((MockDirectoryWrapper)d).NoDeleteOpenFile = false;
            }
            RandomIndexWriter w = new RandomIndexWriter(Random, d, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetIndexDeletionPolicy(NoDeletionPolicy.INSTANCE));

            w.IndexWriter.Config.SetMaxBufferedDocs(TestUtil.NextInt32(Random, 5, 30));

            w.Commit();
            ThreadJob[] indexThreads = new ThreadJob[Random.Next(4)];
            long        stopTime     = Environment.TickCount + AtLeast(1000);

            for (int x = 0; x < indexThreads.Length; x++)
            {
                indexThreads[x]      = new ThreadAnonymousInnerClassHelper(w, stopTime, NewStringField, NewTextField);
                indexThreads[x].Name = "Thread " + x;
                indexThreads[x].Start();
            }

            ISet <string> allFiles = new JCG.HashSet <string>();

            DirectoryReader r = DirectoryReader.Open(d);

            while (Environment.TickCount < stopTime)
            {
                IndexCommit ic = r.IndexCommit;
                if (Verbose)
                {
                    Console.WriteLine("TEST: check files: " + ic.FileNames);
                }
                allFiles.UnionWith(ic.FileNames);
                // Make sure no old files were removed
                foreach (string fileName in allFiles)
                {
                    Assert.IsTrue(SlowFileExists(d, fileName), "file " + fileName + " does not exist");
                }
                DirectoryReader r2 = DirectoryReader.OpenIfChanged(r);
                if (r2 != null)
                {
                    r.Dispose();
                    r = r2;
                }
                Thread.Sleep(1);
            }
            r.Dispose();

            foreach (ThreadJob t in indexThreads)
            {
                t.Join();
            }
            w.Dispose();
            d.Dispose();

            System.IO.Directory.Delete(tmpDir.FullName, true);
        }
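The refresh loop in that test leans on one contract: DirectoryReader.OpenIfChanged returns null when the index has not changed, and a new reader otherwise, so the old reader is disposed only once a replacement exists. The idiom in isolation (a sketch, assuming an existing Directory dir):

        // Reader-reopen idiom used in the loop above:
        DirectoryReader current = DirectoryReader.Open(dir);
        DirectoryReader updated = DirectoryReader.OpenIfChanged(current);
        if (updated != null)
        {
            current.Dispose(); // only dispose once a replacement exists
            current = updated;
        }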
Example #6
        private void CreateRandomIndexes()
        {
            dir1 = NewDirectory();
            dir2 = NewDirectory();
            int           numDocs     = AtLeast(150);
            int           numTerms    = TestUtil.NextInt32(Random, 1, numDocs / 5);
            ISet <string> randomTerms = new JCG.HashSet <string>();

            while (randomTerms.Count < numTerms)
            {
                randomTerms.Add(TestUtil.RandomSimpleString(Random));
            }
            terms = new List <string>(randomTerms);
            long seed = Random.NextInt64();
            IndexWriterConfig iwc1 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed)));
            IndexWriterConfig iwc2 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed)));

            iwc2.SetMergePolicy(NewSortingMergePolicy(sort));
            RandomIndexWriter iw1 = new RandomIndexWriter(new Random((int)seed), dir1, iwc1);
            RandomIndexWriter iw2 = new RandomIndexWriter(new Random((int)seed), dir2, iwc2);

            for (int i = 0; i < numDocs; ++i)
            {
                if (Random.Next(5) == 0 && i != numDocs - 1)
                {
                    string term = RandomPicks.RandomFrom(Random, terms);
                    iw1.DeleteDocuments(new Term("s", term));
                    iw2.DeleteDocuments(new Term("s", term));
                }
                Document doc = randomDocument();
                iw1.AddDocument(doc);
                iw2.AddDocument(doc);
                if (Random.Next(8) == 0)
                {
                    iw1.Commit();
                    iw2.Commit();
                }
            }
            // Make sure we have something to merge
            iw1.Commit();
            iw2.Commit();
            Document doc2 = randomDocument();

            // NOTE: don't use RIW.addDocument directly, since it sometimes commits,
            // which may trigger a merge, in which case forceMerge may not do anything.
            // With field updates this is a problem, since the updates can go into the
            // single segment in the index, and therefore the index won't be sorted.
            // This hurts the assumption of the test later on, that the index is sorted
            // by SortingMP.
            iw1.IndexWriter.AddDocument(doc2);
            iw2.IndexWriter.AddDocument(doc2);

            if (DefaultCodecSupportsFieldUpdates)
            {
                // update NDV of docs belonging to one term (covers many documents)
                long   value = Random.NextInt64();
                string term  = RandomPicks.RandomFrom(Random, terms);
                iw1.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
                iw2.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
            }

            iw1.ForceMerge(1);
            iw2.ForceMerge(1);
            iw1.Dispose();
            iw2.Dispose();
            reader       = DirectoryReader.Open(dir1);
            sortedReader = DirectoryReader.Open(dir2);
        }
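CreateRandomIndexes only works because both writers (and both MockAnalyzers) are driven by Random instances built from the same seed, so every random decision — deletes, commits, analyzer behavior — is replayed identically on dir1 and dir2. The property being relied on:

        // Two System.Random instances seeded identically produce identical sequences,
        // which keeps iw1 and iw2 above in lockstep.
        var r1 = new Random(42);
        var r2 = new Random(42);
        for (int i = 0; i < 5; i++)
        {
            Console.WriteLine(r1.Next() == r2.Next()); // True on every iteration
        }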
Example #7
        static TestRandomChains()
        {
            try
            {
                brokenConstructors[typeof(LimitTokenCountFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int) })] = ALWAYS;
                brokenConstructors[typeof(LimitTokenCountFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int), typeof(bool) })]    = new PredicateAnonymousInnerClassHelper2();
                brokenConstructors[typeof(LimitTokenPositionFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int) })]               = ALWAYS;
                brokenConstructors[typeof(LimitTokenPositionFilter).GetConstructor(new Type[] { typeof(TokenStream), typeof(int), typeof(bool) })] = new PredicateAnonymousInnerClassHelper3();
                foreach (Type c in new Type[] {
                    // TODO: can we promote some of these to be only
                    // offsets offenders?
                    // doesn't actual reset itself:
                    typeof(CachingTokenFilter),
                    // Not broken, simulates brokenness:
                    typeof(CrankyTokenFilter),
                    // Not broken: we forcefully add this, so we shouldn't
                    // also randomly pick it:
                    typeof(ValidatingTokenFilter)
                })
                {
                    foreach (ConstructorInfo ctor in c.GetConstructors())
                    {
                        brokenConstructors[ctor] = ALWAYS;
                    }
                }
            }
            catch (Exception e)
            {
                throw new Exception(e.Message, e);
            }
            try
            {
                foreach (Type c in new Type[] {
                    typeof(ReversePathHierarchyTokenizer),
                    typeof(PathHierarchyTokenizer),
                    // TODO: it seems to mess up offsets!?
                    typeof(WikipediaTokenizer),
                    // TODO: doesn't handle graph inputs
                    typeof(CJKBigramFilter),
                    // TODO: doesn't handle graph inputs (or even look at positionIncrement)
                    typeof(HyphenatedWordsFilter),
                    // TODO: LUCENE-4983
                    typeof(CommonGramsFilter),
                    // TODO: doesn't handle graph inputs
                    typeof(CommonGramsQueryFilter),
                    // TODO: probably doesn't handle graph inputs; too afraid to try
                    typeof(WordDelimiterFilter)
                })
                {
                    foreach (ConstructorInfo ctor in c.GetConstructors())
                    {
                        brokenOffsetsConstructors[ctor] = ALWAYS;
                    }
                }
            }
            catch (Exception e)
            {
                throw new Exception(e.Message, e);
            }

            allowedTokenizerArgs = new JCG.HashSet <Type>(IdentityEqualityComparer <Type> .Default);
            allowedTokenizerArgs.UnionWith(argProducers.Keys);
            allowedTokenizerArgs.Add(typeof(TextReader));
            allowedTokenizerArgs.Add(typeof(AttributeSource.AttributeFactory));
            allowedTokenizerArgs.Add(typeof(AttributeSource));

            allowedTokenFilterArgs = new JCG.HashSet <Type>(IdentityEqualityComparer <Type> .Default);
            allowedTokenFilterArgs.UnionWith(argProducers.Keys);
            allowedTokenFilterArgs.Add(typeof(TokenStream));
            // TODO: fix this one, that's broken:
            allowedTokenFilterArgs.Add(typeof(CommonGramsFilter));

            allowedCharFilterArgs = new JCG.HashSet <Type>(IdentityEqualityComparer <Type> .Default);
            allowedCharFilterArgs.UnionWith(argProducers.Keys);
            allowedCharFilterArgs.Add(typeof(TextReader));
        }
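The three allowed-args sets are built with IdentityEqualityComparer<Type>.Default (from J2N), so membership is decided by object identity rather than by Equals. A rough equivalent, sketched by hand and not the library type:

        // Hypothetical reference-equality comparer, approximating what
        // IdentityEqualityComparer<T>.Default supplies above:
        private sealed class ReferenceComparer<T> : IEqualityComparer<T> where T : class
        {
            public bool Equals(T x, T y) => ReferenceEquals(x, y);
            public int GetHashCode(T obj) => System.Runtime.CompilerServices.RuntimeHelpers.GetHashCode(obj);
        }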
Example #8
        public virtual void TestStressAdvance_Mem()
        {
            for (int iter = 0; iter < 3; iter++)
            {
                if (Verbose)
                {
                    Console.WriteLine("\nTEST: iter=" + iter);
                }
                Directory         dir = NewDirectory();
                RandomIndexWriter w   = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                    this,
#endif
                    Random, dir);
                ISet <int>         aDocs = new JCG.HashSet <int>();
                Documents.Document doc   = new Documents.Document();
                Field f = NewStringField("field", "", Field.Store.NO);
                doc.Add(f);
                Field idField = NewStringField("id", "", Field.Store.YES);
                doc.Add(idField);
                int num = AtLeast(4097);
                if (Verbose)
                {
                    Console.WriteLine("\nTEST: numDocs=" + num);
                }
                for (int id = 0; id < num; id++)
                {
                    if (Random.Next(4) == 3)
                    {
                        f.SetStringValue("a");
                        aDocs.Add(id);
                    }
                    else
                    {
                        f.SetStringValue("b");
                    }
                    idField.SetStringValue("" + id);
                    w.AddDocument(doc);
                    if (Verbose)
                    {
                        Console.WriteLine("\nTEST: doc upto " + id);
                    }
                }

                w.ForceMerge(1);

                IList <int> aDocIDs = new JCG.List <int>();
                IList <int> bDocIDs = new JCG.List <int>();

                DirectoryReader r         = w.GetReader();
                int[]           idToDocID = new int[r.MaxDoc];
                for (int docID = 0; docID < idToDocID.Length; docID++)
                {
                    int id = Convert.ToInt32(r.Document(docID).Get("id"));
                    if (aDocs.Contains(id))
                    {
                        aDocIDs.Add(docID);
                    }
                    else
                    {
                        bDocIDs.Add(docID);
                    }
                }
                TermsEnum te = GetOnlySegmentReader(r).Fields.GetTerms("field").GetEnumerator();

                DocsEnum de = null;
                for (int iter2 = 0; iter2 < 10; iter2++)
                {
                    if (Verbose)
                    {
                        Console.WriteLine("\nTEST: iter=" + iter + " iter2=" + iter2);
                    }
                    Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("a")));
                    de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE);
                    TestOne(de, aDocIDs);

                    Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("b")));
                    de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE);
                    TestOne(de, bDocIDs);
                }

                w.Dispose();
                r.Dispose();
                dir.Dispose();
            }
        }
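A detail worth calling out in that test: doc IDs are not stable (ForceMerge can renumber documents), so the test re-derives, per current docID, which logical id it holds by reading back the stored "id" field before asserting anything. A condensed sketch of that kind of scan, assuming a reader with no deletions:

        // Rebuild an id -> docID mapping after merging, in the style of the loop above:
        int[] idToDocID = new int[reader.MaxDoc];
        for (int docID = 0; docID < reader.MaxDoc; docID++)
        {
            int id = Convert.ToInt32(reader.Document(docID).Get("id"));
            idToDocID[id] = docID;
        }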
Example #9
        public void TestTerms()
        {
            Random random = Random;
            int    num    = AtLeast(10000);

#pragma warning disable 612, 618
            IComparer <BytesRef> comparer = random.NextBoolean() ? BytesRef.UTF8SortedAsUnicodeComparer : BytesRef.UTF8SortedAsUTF16Comparer;
#pragma warning restore 612, 618
            IDictionary <BytesRef, KeyValuePair <long, BytesRef> > sorted = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, BytesRef> >(comparer);
            IDictionary <BytesRef, long> sortedWithoutPayload             = new JCG.SortedDictionary <BytesRef, long>(comparer);
            IDictionary <BytesRef, KeyValuePair <long, ISet <BytesRef> > > sortedWithContext = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, ISet <BytesRef> > >(comparer);
            IDictionary <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > sortedWithPayloadAndContext = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > >(comparer);
            Input[]         unsorted = new Input[num];
            Input[]         unsortedWithoutPayload        = new Input[num];
            Input[]         unsortedWithContexts          = new Input[num];
            Input[]         unsortedWithPayloadAndContext = new Input[num];
            ISet <BytesRef> ctxs;
            for (int i = 0; i < num; i++)
            {
                BytesRef key2;
                BytesRef payload;
                ctxs = new JCG.HashSet <BytesRef>();
                do
                {
                    key2    = new BytesRef(TestUtil.RandomUnicodeString(random));
                    payload = new BytesRef(TestUtil.RandomUnicodeString(random));
                    for (int j = 0; j < AtLeast(2); j++)
                    {
                        ctxs.Add(new BytesRef(TestUtil.RandomUnicodeString(random)));
                    }
                } while (sorted.ContainsKey(key2));
                long value = random.Next();
                sortedWithoutPayload.Put(key2, value);
                sorted.Put(key2, new KeyValuePair <long, BytesRef>(value, payload));
                sortedWithContext.Put(key2, new KeyValuePair <long, ISet <BytesRef> >(value, ctxs));
                sortedWithPayloadAndContext.Put(key2, new KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > >(value, new KeyValuePair <BytesRef, ISet <BytesRef> >(payload, ctxs)));
                unsorted[i] = new Input(key2, value, payload);
                unsortedWithoutPayload[i]        = new Input(key2, value);
                unsortedWithContexts[i]          = new Input(key2, value, ctxs);
                unsortedWithPayloadAndContext[i] = new Input(key2, value, payload, ctxs);
            }

            // test the sorted iterator wrapper with payloads
            IInputIterator wrapper = new SortedInputIterator(new InputArrayIterator(unsorted), comparer);
            IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, BytesRef> > > expected = sorted.GetEnumerator();
            while (expected.MoveNext())
            {
                KeyValuePair <BytesRef, KeyValuePair <long, BytesRef> > entry = expected.Current;


                assertEquals(entry.Key, wrapper.Next());
                assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
                assertEquals(entry.Value.Value, wrapper.Payload);
            }
            assertNull(wrapper.Next());

            // test the sorted iterator wrapper with contexts
            wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithContexts), comparer);
            IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, ISet <BytesRef> > > > actualEntries = sortedWithContext.GetEnumerator();
            while (actualEntries.MoveNext())
            {
                KeyValuePair <BytesRef, KeyValuePair <long, ISet <BytesRef> > > entry = actualEntries.Current;
                assertEquals(entry.Key, wrapper.Next());
                assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
                ISet <BytesRef> actualCtxs = entry.Value.Value;
                assertEquals(actualCtxs, wrapper.Contexts);
            }
            assertNull(wrapper.Next());

            // test the sorted iterator wrapper with contexts and payload
            wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithPayloadAndContext), comparer);
            IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > > expectedPayloadContextEntries = sortedWithPayloadAndContext.GetEnumerator();
            while (expectedPayloadContextEntries.MoveNext())
            {
                KeyValuePair <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > entry = expectedPayloadContextEntries.Current;
                assertEquals(entry.Key, wrapper.Next());
                assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
                ISet <BytesRef> actualCtxs = entry.Value.Value.Value;
                assertEquals(actualCtxs, wrapper.Contexts);
                BytesRef actualPayload = entry.Value.Value.Key;
                assertEquals(actualPayload, wrapper.Payload);
            }
            assertNull(wrapper.Next());

            // test the unsorted iterator wrapper with payloads
            wrapper = new UnsortedInputIterator(new InputArrayIterator(unsorted));
            IDictionary <BytesRef, KeyValuePair <long, BytesRef> > actual = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, BytesRef> >();
            BytesRef key;
            while ((key = wrapper.Next()) != null)
            {
                long     value   = wrapper.Weight;
                BytesRef payload = wrapper.Payload;
                actual.Put(BytesRef.DeepCopyOf(key), new KeyValuePair <long, BytesRef>(value, BytesRef.DeepCopyOf(payload)));
            }
            assertEquals(sorted, actual, aggressive: false);

            // test the sorted iterator wrapper without payloads
            IInputIterator wrapperWithoutPayload = new SortedInputIterator(new InputArrayIterator(unsortedWithoutPayload), comparer);
            IEnumerator <KeyValuePair <BytesRef, long> > expectedWithoutPayload = sortedWithoutPayload.GetEnumerator();
            while (expectedWithoutPayload.MoveNext())
            {
                KeyValuePair <BytesRef, long> entry = expectedWithoutPayload.Current;


                assertEquals(entry.Key, wrapperWithoutPayload.Next());
                assertEquals(Convert.ToInt64(entry.Value), wrapperWithoutPayload.Weight);
                assertNull(wrapperWithoutPayload.Payload);
            }
            assertNull(wrapperWithoutPayload.Next());

            // test the unsorted iterator wrapper without payloads
            wrapperWithoutPayload = new UnsortedInputIterator(new InputArrayIterator(unsortedWithoutPayload));
            IDictionary <BytesRef, long> actualWithoutPayload = new JCG.SortedDictionary <BytesRef, long>();
            while ((key = wrapperWithoutPayload.Next()) != null)
            {
                long value = wrapperWithoutPayload.Weight;
                assertNull(wrapperWithoutPayload.Payload);
                actualWithoutPayload.Put(BytesRef.DeepCopyOf(key), value);
            }
            assertEquals(sortedWithoutPayload, actualWithoutPayload, aggressive: false);
        }
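Everything in that test hinges on the expected maps and the SortedInputIterator sharing one comparer, so both sides enumerate keys in the same order. The underlying mechanism, shown with the BCL SortedDictionary rather than the J2N one:

        // A SortedDictionary enumerates in the order its comparer defines, which is
        // what lets the expected entries above be walked in lockstep with the iterator:
        var byOrdinal = new SortedDictionary<string, int>(StringComparer.Ordinal)
        {
            ["b"] = 2,
            ["a"] = 1,
            ["C"] = 3
        };
        foreach (KeyValuePair<string, int> kv in byOrdinal)
        {
            Console.WriteLine(kv.Key); // prints "C", "a", "b": ordinal order, uppercase first
        }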
Example #10
        public virtual void TestRandomWithPrefix()
        {
            Directory dir = NewDirectory();

            ISet <string> prefixes  = new JCG.HashSet <string>();
            int           numPrefix = TestUtil.NextInt32(Random, 2, 7);

            if (Verbose)
            {
                Console.WriteLine("TEST: use " + numPrefix + " prefixes");
            }
            while (prefixes.Count < numPrefix)
            {
                prefixes.Add(TestUtil.RandomRealisticUnicodeString(Random));
                //prefixes.Add(TestUtil.RandomSimpleString(random));
            }
            string[] prefixesArray = prefixes.ToArray(/*new string[prefixes.Count]*/);

            int             NUM_TERMS = AtLeast(20);
            ISet <BytesRef> terms     = new JCG.HashSet <BytesRef>();

            while (terms.Count < NUM_TERMS)
            {
                string s = prefixesArray[Random.Next(prefixesArray.Length)] + TestUtil.RandomRealisticUnicodeString(Random);
                //final String s = prefixesArray[random.nextInt(prefixesArray.Length)] + TestUtil.RandomSimpleString(random);
                if (s.Length > 0)
                {
                    terms.Add(new BytesRef(s));
                }
            }
            BytesRef[] termsArray = terms.ToArray();
            Array.Sort(termsArray);

            int NUM_DOCS = AtLeast(100);

            IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            // Sometimes swap in codec that impls ord():
            if (Random.Next(10) == 7)
            {
                Codec codec = TestUtil.AlwaysPostingsFormat(PostingsFormat.ForName("Lucene41WithOrds"));
                conf.SetCodec(codec);
            }

            RandomIndexWriter w = new RandomIndexWriter(Random, dir, conf);

            int[][]     idToOrds      = new int[NUM_DOCS][];
            ISet <int?> ordsForDocSet = new JCG.HashSet <int?>();

            for (int id = 0; id < NUM_DOCS; id++)
            {
                Document doc = new Document();

                doc.Add(new Int32Field("id", id, Field.Store.NO));

                int termCount = TestUtil.NextInt32(Random, 0, 20 * RandomMultiplier);
                while (ordsForDocSet.Count < termCount)
                {
                    ordsForDocSet.Add(Random.Next(termsArray.Length));
                }
                int[] ordsForDoc = new int[termCount];
                int   upto       = 0;
                if (Verbose)
                {
                    Console.WriteLine("TEST: doc id=" + id);
                }
                foreach (int ord in ordsForDocSet)
                {
                    ordsForDoc[upto++] = ord;
                    Field field = NewStringField("field", termsArray[ord].Utf8ToString(), Field.Store.NO);
                    if (Verbose)
                    {
                        Console.WriteLine("  f=" + termsArray[ord].Utf8ToString());
                    }
                    doc.Add(field);
                }
                ordsForDocSet.Clear();
                Array.Sort(ordsForDoc);
                idToOrds[id] = ordsForDoc;
                w.AddDocument(doc);
            }

            DirectoryReader r = w.GetReader();

            w.Dispose();

            if (Verbose)
            {
                Console.WriteLine("TEST: reader=" + r);
            }

            AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(r);

            foreach (string prefix in prefixesArray)
            {
                BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix);

                int[][] idToOrdsPrefix = new int[NUM_DOCS][];
                for (int id = 0; id < NUM_DOCS; id++)
                {
                    int[]        docOrds = idToOrds[id];
                    IList <int?> newOrds = new List <int?>();
                    foreach (int ord in idToOrds[id])
                    {
                        if (StringHelper.StartsWith(termsArray[ord], prefixRef))
                        {
                            newOrds.Add(ord);
                        }
                    }
                    int[] newOrdsArray = new int[newOrds.Count];
                    int   upto         = 0;
                    foreach (int ord in newOrds)
                    {
                        newOrdsArray[upto++] = ord;
                    }
                    idToOrdsPrefix[id] = newOrdsArray;
                }

                foreach (AtomicReaderContext ctx in r.Leaves)
                {
                    if (Verbose)
                    {
                        Console.WriteLine("\nTEST: sub=" + ctx.Reader);
                    }
                    Verify((AtomicReader)ctx.Reader, idToOrdsPrefix, termsArray, prefixRef);
                }

                // Also test top-level reader: its enum does not support
                // ord, so this forces the OrdWrapper to run:
                if (Verbose)
                {
                    Console.WriteLine("TEST: top reader");
                }
                Verify(slowR, idToOrdsPrefix, termsArray, prefixRef);
            }

            FieldCache.DEFAULT.PurgeByCacheKey(slowR.CoreCacheKey);

            r.Dispose();
            dir.Dispose();
        }
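StringHelper.StartsWith in that prefix loop compares the raw UTF-8 bytes of the term against the prefix. Written out by hand, as a sketch over the BytesRef fields used above:

        // Hand-rolled equivalent of the byte-wise prefix test used above:
        private static bool StartsWithBytes(BytesRef term, BytesRef prefix)
        {
            if (prefix.Length > term.Length)
            {
                return false;
            }
            for (int i = 0; i < prefix.Length; i++)
            {
                if (term.Bytes[term.Offset + i] != prefix.Bytes[prefix.Offset + i])
                {
                    return false;
                }
            }
            return true;
        }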
Example #11
        private IndexIterationContext CreateContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter,
                                                    bool multipleValuesPerDocument, bool scoreDocsInOrder)
        {
            IndexIterationContext context = new IndexIterationContext();
            int numRandomValues           = nDocs / 2;

            context.RandomUniqueValues = new string[numRandomValues];
            ISet <string> trackSet = new JCG.HashSet <string>();

            context.RandomFrom = new bool[numRandomValues];
            for (int i = 0; i < numRandomValues; i++)
            {
                string uniqueRandomValue;
                do
                {
                    uniqueRandomValue = TestUtil.RandomRealisticUnicodeString(Random);
                    //        uniqueRandomValue = TestUtil.randomSimpleString(random);
                } while ("".Equals(uniqueRandomValue, StringComparison.Ordinal) || trackSet.Contains(uniqueRandomValue));
                // Generate unique values and empty strings aren't allowed.
                trackSet.Add(uniqueRandomValue);
                context.RandomFrom[i]         = Random.NextBoolean();
                context.RandomUniqueValues[i] = uniqueRandomValue;
            }

            RandomDoc[] docs = new RandomDoc[nDocs];
            for (int i = 0; i < nDocs; i++)
            {
                string   id       = Convert.ToString(i);
                int      randomI  = Random.Next(context.RandomUniqueValues.Length);
                string   value    = context.RandomUniqueValues[randomI];
                Document document = new Document();
                document.Add(NewTextField(Random, "id", id, Field.Store.NO));
                document.Add(NewTextField(Random, "value", value, Field.Store.NO));

                bool from = context.RandomFrom[randomI];
                int  numberOfLinkValues = multipleValuesPerDocument ? 2 + Random.Next(10) : 1;
                docs[i] = new RandomDoc(id, numberOfLinkValues, value, from);
                for (int j = 0; j < numberOfLinkValues; j++)
                {
                    string linkValue = context.RandomUniqueValues[Random.Next(context.RandomUniqueValues.Length)];
                    docs[i].linkValues.Add(linkValue);
                    if (from)
                    {
                        if (!context.FromDocuments.TryGetValue(linkValue, out IList <RandomDoc> fromDocs))
                        {
                            context.FromDocuments[linkValue] = fromDocs = new List <RandomDoc>();
                        }
                        if (!context.RandomValueFromDocs.TryGetValue(value, out IList <RandomDoc> randomValueFromDocs))
                        {
                            context.RandomValueFromDocs[value] = randomValueFromDocs = new List <RandomDoc>();
                        }

                        fromDocs.Add(docs[i]);
                        randomValueFromDocs.Add(docs[i]);
                        document.Add(NewTextField(Random, "from", linkValue, Field.Store.NO));
                    }
                    else
                    {
                        if (!context.ToDocuments.TryGetValue(linkValue, out IList <RandomDoc> toDocuments))
                        {
                            context.ToDocuments[linkValue] = toDocuments = new List <RandomDoc>();
                        }
                        if (!context.RandomValueToDocs.TryGetValue(value, out IList <RandomDoc> randomValueToDocs))
                        {
                            context.RandomValueToDocs[value] = randomValueToDocs = new List <RandomDoc>();
                        }

                        toDocuments.Add(docs[i]);
                        randomValueToDocs.Add(docs[i]);
                        document.Add(NewTextField(Random, "to", linkValue, Field.Store.NO));
                    }
                }

                RandomIndexWriter w;
                if (from)
                {
                    w = fromWriter;
                }
                else
                {
                    w = toWriter;
                }

                w.AddDocument(document);
                if (Random.Next(10) == 4)
                {
                    w.Commit();
                }
                if (Verbose)
                {
                    Console.WriteLine("Added document[" + docs[i].id + "]: " + document);
                }
            }

            // Pre-compute all possible hits for all unique random values. On top of this also compute all possible score for
            // any ScoreMode.
            IndexSearcher fromSearcher = NewSearcher(fromWriter.GetReader());
            IndexSearcher toSearcher   = NewSearcher(toWriter.GetReader());

            for (int i = 0; i < context.RandomUniqueValues.Length; i++)
            {
                string uniqueRandomValue = context.RandomUniqueValues[i];
                string fromField;
                string toField;
                IDictionary <string, IDictionary <int, JoinScore> > queryVals;
                if (context.RandomFrom[i])
                {
                    fromField = "from";
                    toField   = "to";
                    queryVals = context.FromHitsToJoinScore;
                }
                else
                {
                    fromField = "to";
                    toField   = "from";
                    queryVals = context.ToHitsToJoinScore;
                }
                IDictionary <BytesRef, JoinScore> joinValueToJoinScores = new Dictionary <BytesRef, JoinScore>();
                if (multipleValuesPerDocument)
                {
                    fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)),
                                        new CollectorAnonymousInnerClassHelper3(this, context, fromField, joinValueToJoinScores));
                }
                else
                {
                    fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)),
                                        new CollectorAnonymousInnerClassHelper4(this, context, fromField, joinValueToJoinScores));
                }

                IDictionary <int, JoinScore> docToJoinScore = new Dictionary <int, JoinScore>();
                if (multipleValuesPerDocument)
                {
                    if (scoreDocsInOrder)
                    {
                        AtomicReader slowCompositeReader = SlowCompositeReaderWrapper.Wrap(toSearcher.IndexReader);
                        Terms        terms = slowCompositeReader.GetTerms(toField);
                        if (terms != null)
                        {
                            DocsEnum  docsEnum  = null;
                            TermsEnum termsEnum = null;
                            JCG.SortedSet <BytesRef> joinValues =
                                new JCG.SortedSet <BytesRef>(BytesRef.UTF8SortedAsUnicodeComparer);
                            joinValues.UnionWith(joinValueToJoinScores.Keys);
                            foreach (BytesRef joinValue in joinValues)
                            {
                                termsEnum = terms.GetIterator(termsEnum);
                                if (termsEnum.SeekExact(joinValue))
                                {
                                    docsEnum = termsEnum.Docs(slowCompositeReader.LiveDocs, docsEnum, DocsFlags.NONE);
                                    JoinScore joinScore = joinValueToJoinScores[joinValue];

                                    for (int doc = docsEnum.NextDoc();
                                         doc != DocIdSetIterator.NO_MORE_DOCS;
                                         doc = docsEnum.NextDoc())
                                    {
                                        // First encountered join value determines the score.
                                        // Something to keep in mind for many-to-many relations.
                                        if (!docToJoinScore.ContainsKey(doc))
                                        {
                                            docToJoinScore[doc] = joinScore;
                                        }
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        toSearcher.Search(new MatchAllDocsQuery(),
                                          new CollectorAnonymousInnerClassHelper5(this, context, toField, joinValueToJoinScores,
                                                                                  docToJoinScore));
                    }
                }
                else
                {
                    toSearcher.Search(new MatchAllDocsQuery(),
                                      new CollectorAnonymousInnerClassHelper6(this, toField, joinValueToJoinScores,
                                                                              docToJoinScore));
                }
                queryVals[uniqueRandomValue] = docToJoinScore;
            }

            fromSearcher.IndexReader.Dispose();
            toSearcher.IndexReader.Dispose();

            return(context);
        }
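A hedged usage sketch of what this harness verifies: JoinUtil.CreateJoinQuery (Lucene.Net.Join) builds a query from the "from" side that is then run against the "to" side. The field names, searchers, and term value below are placeholders standing in for the setup above.

        Query fromQuery = new TermQuery(new Term("value", "someValue"));
        Query joinQuery = JoinUtil.CreateJoinQuery(
            "from",                  // fromField: join values read from docs matching fromQuery
            false,                   // multipleValuesPerDocument
            "to",                    // toField: docs whose "to" value matches are returned
            fromQuery,
            fromSearcher,            // searcher over the "from" side (placeholder)
            ScoreMode.Max);          // score propagation: None, Total, Max or Avg
        TopDocs joinHits = toSearcher.Search(joinQuery, 10);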
Example No. 12
        public virtual void TestDefaults()
        {
            IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
            Assert.AreEqual(typeof(MockAnalyzer), conf.Analyzer.GetType());
            Assert.IsNull(conf.IndexCommit);
            Assert.AreEqual(typeof(KeepOnlyLastCommitDeletionPolicy), conf.IndexDeletionPolicy.GetType());
#if !FEATURE_CONCURRENTMERGESCHEDULER
            Assert.AreEqual(typeof(TaskMergeScheduler), conf.MergeScheduler.GetType());
#else
            Assert.AreEqual(typeof(ConcurrentMergeScheduler), conf.MergeScheduler.GetType());
#endif
            Assert.AreEqual(OpenMode.CREATE_OR_APPEND, conf.OpenMode);
            // we don't need to assert this, it should be unspecified
            Assert.IsTrue(IndexSearcher.DefaultSimilarity == conf.Similarity);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, conf.TermIndexInterval);
            Assert.AreEqual(IndexWriterConfig.DefaultWriteLockTimeout, conf.WriteLockTimeout);
            Assert.AreEqual(IndexWriterConfig.WRITE_LOCK_TIMEOUT, IndexWriterConfig.DefaultWriteLockTimeout);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS, conf.MaxBufferedDeleteTerms);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB, conf.RAMBufferSizeMB, 0.0);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_MAX_BUFFERED_DOCS, conf.MaxBufferedDocs);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_READER_POOLING, conf.UseReaderPooling);
            Assert.IsTrue(DocumentsWriterPerThread.DefaultIndexingChain == conf.IndexingChain);
            Assert.IsNull(conf.MergedSegmentWarmer);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_READER_TERMS_INDEX_DIVISOR, conf.ReaderTermsIndexDivisor);
            Assert.AreEqual(typeof(TieredMergePolicy), conf.MergePolicy.GetType());
            Assert.AreEqual(typeof(DocumentsWriterPerThreadPool), conf.IndexerThreadPool.GetType());
            Assert.AreEqual(typeof(FlushByRamOrCountsPolicy), conf.FlushPolicy.GetType());
            Assert.AreEqual(IndexWriterConfig.DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB, conf.RAMPerThreadHardLimitMB);
            Assert.AreEqual(Codec.Default, conf.Codec);
            Assert.AreEqual((object)InfoStream.Default, conf.InfoStream);
            Assert.AreEqual(IndexWriterConfig.DEFAULT_USE_COMPOUND_FILE_SYSTEM, conf.UseCompoundFile);
            // Sanity check - validate that all getters are covered.
            ISet<string> getters = new JCG.HashSet<string>();
            getters.Add("getAnalyzer");
            getters.Add("getIndexCommit");
            getters.Add("getIndexDeletionPolicy");
            getters.Add("getMaxFieldLength");
            getters.Add("getMergeScheduler");
            getters.Add("getOpenMode");
            getters.Add("getSimilarity");
            getters.Add("getTermIndexInterval");
            getters.Add("getWriteLockTimeout");
            getters.Add("getDefaultWriteLockTimeout");
            getters.Add("getMaxBufferedDeleteTerms");
            getters.Add("getRAMBufferSizeMB");
            getters.Add("getMaxBufferedDocs");
            getters.Add("getIndexingChain");
            getters.Add("getMergedSegmentWarmer");
            getters.Add("getMergePolicy");
            getters.Add("getMaxThreadStates");
            getters.Add("getReaderPooling");
            getters.Add("getIndexerThreadPool");
            getters.Add("getReaderTermsIndexDivisor");
            getters.Add("getFlushPolicy");
            getters.Add("getRAMPerThreadHardLimitMB");
            getters.Add("getCodec");
            getters.Add("getInfoStream");
            getters.Add("getUseCompoundFile");

            foreach (MethodInfo m in typeof(IndexWriterConfig).GetMethods())
            {
                if (m.DeclaringType == typeof(IndexWriterConfig) && m.Name.StartsWith("get", StringComparison.Ordinal) && !m.Name.StartsWith("get_", StringComparison.Ordinal))
                {
                    Assert.IsTrue(getters.Contains(m.Name), "method " + m.Name + " is not tested for defaults");
                }
            }
        }
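A minimal counterpart sketch, assuming Lucene.NET 4.8: each default asserted above has a matching settable property, so a non-default configuration can be built like this.

        IndexWriterConfig custom = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
        {
            OpenMode = OpenMode.CREATE,   // instead of the CREATE_OR_APPEND default
            RAMBufferSizeMB = 48,         // instead of DEFAULT_RAM_BUFFER_SIZE_MB
            UseCompoundFile = false       // instead of DEFAULT_USE_COMPOUND_FILE_SYSTEM
        };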
Example No. 13
        /// <summary>
        /// Tests a CacheEntry[] for indication of "insane" cache usage.
        /// <para>
        /// <b>NOTE:</b> FieldCache CreationPlaceholder objects are ignored.
        /// (:TODO: is this a bad idea? are we masking a real problem?)
        /// </para>
        /// </summary>
        public Insanity[] Check(params FieldCache.CacheEntry[] cacheEntries)
        {
            if (null == cacheEntries || 0 == cacheEntries.Length)
            {
                return(Arrays.Empty <Insanity>());
            }

            if (estimateRam)
            {
                for (int i = 0; i < cacheEntries.Length; i++)
                {
                    cacheEntries[i].EstimateSize();
                }
            }

            // the indirect mapping lets MapOfSet dedup identical valIds for us
            // maps the (valId) identityhashCode of cache values to
            // sets of CacheEntry instances
            MapOfSets <int, FieldCache.CacheEntry> valIdToItems = new MapOfSets <int, FieldCache.CacheEntry>(new Dictionary <int, ISet <FieldCache.CacheEntry> >(17));
            // maps ReaderField keys to Sets of ValueIds
            MapOfSets <ReaderField, int> readerFieldToValIds = new MapOfSets <ReaderField, int>(new Dictionary <ReaderField, ISet <int> >(17));

            // any keys that we know result in more than one valId
            ISet <ReaderField> valMismatchKeys = new JCG.HashSet <ReaderField>();

            // iterate over all the cacheEntries to get the mappings we'll need
            for (int i = 0; i < cacheEntries.Length; i++)
            {
                FieldCache.CacheEntry item = cacheEntries[i];
                object val = item.Value;

                // It's OK to have dup entries, where one is eg
                // float[] and the other is the Bits (from
                // getDocWithField())
                if (val is IBits)
                {
                    continue;
                }

                if (val is FieldCache.ICreationPlaceholder)
                {
                    continue;
                }

                ReaderField rf = new ReaderField(item.ReaderKey, item.FieldName);

                int valId = RuntimeHelpers.GetHashCode(val);

                // indirect mapping, so the MapOfSet will dedup identical valIds for us
                valIdToItems.Put(valId, item);
                if (1 < readerFieldToValIds.Put(rf, valId))
                {
                    valMismatchKeys.Add(rf);
                }
            }

            List <Insanity> insanity = new List <Insanity>(valMismatchKeys.Count * 3);

            insanity.AddRange(CheckValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys));
            insanity.AddRange(CheckSubreaders(valIdToItems, readerFieldToValIds));

            return(insanity.ToArray());
        }
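A hedged usage sketch: the checker is normally run over the global FieldCache's current entries; the calls below are assumed from the Lucene.NET FieldCache API.

        FieldCache.CacheEntry[] entries = FieldCache.DEFAULT.GetCacheEntries();
        Insanity[] problems = new FieldCacheSanityChecker().Check(entries);
        foreach (Insanity insanity in problems)
        {
            Console.WriteLine(insanity); // one report per suspect group of cache entries
        }
Example No. 14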
        /// <summary>
        /// Fills a <see cref="T:IDictionary{string, WeightedSpanTerm}"/> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <see cref="SpanQuery"/>.
        /// </summary>
        /// <param name="terms"><see cref="T:IDictionary{string, WeightedSpanTerm}"/> to place created <see cref="WeightedSpanTerm"/>s in</param>
        /// <param name="spanQuery"><see cref="SpanQuery"/> to extract Terms from</param>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        protected virtual void ExtractWeightedSpanTerms(IDictionary <string, WeightedSpanTerm> terms, SpanQuery spanQuery)
        {
            ISet <string> fieldNames;

            if (fieldName == null)
            {
                fieldNames = new JCG.HashSet <string>();
                CollectSpanQueryFields(spanQuery, fieldNames);
            }
            else
            {
                fieldNames = new JCG.HashSet <string>
                {
                    fieldName
                };
            }
            // To support the use of the default field name
            if (defaultField != null)
            {
                fieldNames.Add(defaultField);
            }

            IDictionary <string, SpanQuery> queries = new JCG.Dictionary <string, SpanQuery>();

            var  nonWeightedTerms = new JCG.HashSet <Term>();
            bool mustRewriteQuery = MustRewriteQuery(spanQuery);

            if (mustRewriteQuery)
            {
                foreach (string field in fieldNames)
                {
                    SpanQuery rewrittenQuery = (SpanQuery)spanQuery.Rewrite(GetLeafContext().Reader);
                    queries[field] = rewrittenQuery;
                    rewrittenQuery.ExtractTerms(nonWeightedTerms);
                }
            }
            else
            {
                spanQuery.ExtractTerms(nonWeightedTerms);
            }

            List <PositionSpan> spanPositions = new List <PositionSpan>();

            foreach (string field in fieldNames)
            {
                SpanQuery q;
                q = mustRewriteQuery ? queries[field] : spanQuery;

                AtomicReaderContext context = GetLeafContext();
                var         termContexts    = new JCG.Dictionary <Term, TermContext>();
                ISet <Term> extractedTerms  = new JCG.SortedSet <Term>();
                q.ExtractTerms(extractedTerms);
                foreach (Term term in extractedTerms)
                {
                    termContexts[term] = TermContext.Build(context, term);
                }
                IBits       acceptDocs = context.AtomicReader.LiveDocs;
                Spans.Spans spans      = q.GetSpans(context, acceptDocs, termContexts);

                // collect span positions
                while (spans.MoveNext())
                {
                    spanPositions.Add(new PositionSpan(spans.Start, spans.End - 1));
                }
            }

            if (spanPositions.Count == 0)
            {
                // no spans found
                return;
            }

            foreach (Term queryTerm in nonWeightedTerms)
            {
                if (FieldNameComparer(queryTerm.Field))
                {
                    WeightedSpanTerm weightedSpanTerm;
                    if (!terms.TryGetValue(queryTerm.Text(), out weightedSpanTerm) || weightedSpanTerm == null)
                    {
                        weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text());
                        weightedSpanTerm.AddPositionSpans(spanPositions);
                        weightedSpanTerm.IsPositionSensitive = true;
                        terms[queryTerm.Text()] = weightedSpanTerm;
                    }
                    else
                    {
                        if (spanPositions.Count > 0)
                        {
                            weightedSpanTerm.AddPositionSpans(spanPositions);
                        }
                    }
                }
            }
        }
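A hedged usage sketch: callers typically reach this protected method through the extractor's public entry point; the query, token stream, and field name below are placeholders.

        var extractor = new WeightedSpanTermExtractor();
        IDictionary<string, WeightedSpanTerm> weighted =
            extractor.GetWeightedSpanTerms(query, tokenStream, "body");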
Example No. 15
        /// <summary>
        /// Reads from legacy 3.x segments_N. </summary>
        private SegmentCommitInfo ReadLegacySegmentInfo(Directory dir, int format, IndexInput input)
        {
            // check that it is a format we can understand
            if (format > Lucene3xSegmentInfoFormat.FORMAT_DIAGNOSTICS)
            {
                throw new IndexFormatTooOldException(input, format, Lucene3xSegmentInfoFormat.FORMAT_DIAGNOSTICS, Lucene3xSegmentInfoFormat.FORMAT_3_1);
            }
            if (format < Lucene3xSegmentInfoFormat.FORMAT_3_1)
            {
                throw new IndexFormatTooNewException(input, format, Lucene3xSegmentInfoFormat.FORMAT_DIAGNOSTICS, Lucene3xSegmentInfoFormat.FORMAT_3_1);
            }
            string version;

            if (format <= Lucene3xSegmentInfoFormat.FORMAT_3_1)
            {
                version = input.ReadString();
            }
            else
            {
                version = null;
            }

            string name = input.ReadString();

            int  docCount = input.ReadInt32();
            long delGen   = input.ReadInt64();

            int docStoreOffset = input.ReadInt32();
            IDictionary <string, string> attributes = new Dictionary <string, string>();

            // parse the docstore stuff and shove it into attributes
            string docStoreSegment;
            bool   docStoreIsCompoundFile;

            if (docStoreOffset != -1)
            {
                docStoreSegment        = input.ReadString();
                docStoreIsCompoundFile = input.ReadByte() == SegmentInfo.YES;
                attributes[Lucene3xSegmentInfoFormat.DS_OFFSET_KEY]   = Convert.ToString(docStoreOffset, CultureInfo.InvariantCulture);
                attributes[Lucene3xSegmentInfoFormat.DS_NAME_KEY]     = docStoreSegment;
                attributes[Lucene3xSegmentInfoFormat.DS_COMPOUND_KEY] = Convert.ToString(docStoreIsCompoundFile, CultureInfo.InvariantCulture);
            }
            else
            {
                docStoreSegment        = name;
                docStoreIsCompoundFile = false;
            }

            // pre-4.0 indexes write a byte if there is a single norms file
            byte b = input.ReadByte();

            //System.out.println("version=" + version + " name=" + name + " docCount=" + docCount + " delGen=" + delGen + " dso=" + docStoreOffset + " dss=" + docStoreSegment + " dssCFs=" + docStoreIsCompoundFile + " b=" + b + " format=" + format);

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(1 == b, "expected 1 but was: {0} format: {1}", b, format);
            }
            int numNormGen = input.ReadInt32();
            IDictionary <int, long> normGen;

            if (numNormGen == SegmentInfo.NO)
            {
                normGen = null;
            }
            else
            {
                normGen = new Dictionary <int, long>();
                for (int j = 0; j < numNormGen; j++)
                {
                    normGen[j] = input.ReadInt64();
                }
            }
            bool isCompoundFile = input.ReadByte() == SegmentInfo.YES;

            int delCount = input.ReadInt32();

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(delCount <= docCount);
            }

            //bool hasProx = input.ReadByte() == 1;
            input.ReadByte(); // LUCENENET: IDE0059: Remove unnecessary value assignment

            IDictionary <string, string> diagnostics = input.ReadStringStringMap();

            if (format <= Lucene3xSegmentInfoFormat.FORMAT_HAS_VECTORS)
            {
                // NOTE: unused
                //int hasVectors = input.ReadByte();
                input.ReadByte(); // LUCENENET: IDE0059: Remove unnecessary value assignment
            }

            // Replicate logic from 3.x's SegmentInfo.files():
            ISet <string> files = new JCG.HashSet <string>();

            if (isCompoundFile)
            {
                files.Add(IndexFileNames.SegmentFileName(name, "", IndexFileNames.COMPOUND_FILE_EXTENSION));
            }
            else
            {
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xFieldInfosReader.FIELD_INFOS_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.FREQ_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.PROX_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.TERMS_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xNormsProducer.NORMS_EXTENSION));
            }

            if (docStoreOffset != -1)
            {
                if (docStoreIsCompoundFile)
                {
                    files.Add(IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xCodec.COMPOUND_FILE_STORE_EXTENSION));
                }
                else
                {
                    files.Add(IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xStoredFieldsReader.FIELDS_INDEX_EXTENSION));
                    files.Add(IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xStoredFieldsReader.FIELDS_EXTENSION));
                    AddIfExists(dir, files, IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION));
                    AddIfExists(dir, files, IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION));
                    AddIfExists(dir, files, IndexFileNames.SegmentFileName(docStoreSegment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION));
                }
            }
            else if (!isCompoundFile)
            {
                files.Add(IndexFileNames.SegmentFileName(name, "", Lucene3xStoredFieldsReader.FIELDS_INDEX_EXTENSION));
                files.Add(IndexFileNames.SegmentFileName(name, "", Lucene3xStoredFieldsReader.FIELDS_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION));
                AddIfExists(dir, files, IndexFileNames.SegmentFileName(name, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION));
            }

            // parse the normgen stuff and shove it into attributes
            if (normGen != null)
            {
                attributes[Lucene3xSegmentInfoFormat.NORMGEN_KEY] = Convert.ToString(numNormGen, CultureInfo.InvariantCulture);
                foreach (KeyValuePair <int, long> ent in normGen)
                {
                    long gen = ent.Value;
                    if (gen >= SegmentInfo.YES)
                    {
                        // Definitely a separate norm file, with generation:
                        files.Add(IndexFileNames.FileNameFromGeneration(name, "s" + ent.Key, gen));
                        attributes[Lucene3xSegmentInfoFormat.NORMGEN_PREFIX + ent.Key] = Convert.ToString(gen, CultureInfo.InvariantCulture);
                    }
                    else if (gen == SegmentInfo.NO)
                    {
                        // No separate norm
                    }
                    else
                    {
                        // We should have already hit indexformat too old exception
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(false);
                        }
                    }
                }
            }

            SegmentInfo info = new SegmentInfo(dir, version, name, docCount, isCompoundFile, null, diagnostics, attributes.AsReadOnly());

            info.SetFiles(files);

            SegmentCommitInfo infoPerCommit = new SegmentCommitInfo(info, delCount, delGen, -1);

            return(infoPerCommit);
        }
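The AddIfExists helper referenced above is not shown; a plausible reconstruction, assuming the Lucene.NET Directory API:

        // Hypothetical reconstruction: add the file name only when it actually exists,
        // since legacy 3.x segments may lack optional files (e.g. term vectors).
        private static void AddIfExists(Directory dir, ISet<string> files, string fileName)
        {
            if (dir.FileExists(fileName))
            {
                files.Add(fileName);
            }
        }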
Example No. 16
        public virtual void TestRandomStringSort()
        {
            Random random = new Random(Random.Next());

            int               NUM_DOCS = AtLeast(100);
            Directory         dir      = NewDirectory();
            RandomIndexWriter writer   = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                random, dir);
            bool          allowDups = random.NextBoolean();
            ISet <string> seen      = new JCG.HashSet <string>();
            int           maxLength = TestUtil.NextInt32(random, 5, 100);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: NUM_DOCS=" + NUM_DOCS + " maxLength=" + maxLength + " allowDups=" + allowDups);
            }

            int numDocs = 0;
            IList <BytesRef> docValues = new List <BytesRef>();

            // TODO: deletions
            while (numDocs < NUM_DOCS)
            {
                Document doc = new Document();

                // 10% of the time, the document is missing the value:
                BytesRef br;
                if (LuceneTestCase.Random.Next(10) != 7)
                {
                    string s;
                    if (random.NextBoolean())
                    {
                        s = TestUtil.RandomSimpleString(random, maxLength);
                    }
                    else
                    {
                        s = TestUtil.RandomUnicodeString(random, maxLength);
                    }

                    if (!allowDups)
                    {
                        if (seen.Contains(s))
                        {
                            continue;
                        }
                        seen.Add(s);
                    }

                    if (VERBOSE)
                    {
                        Console.WriteLine("  " + numDocs + ": s=" + s);
                    }

                    br = new BytesRef(s);
                    if (DefaultCodecSupportsDocValues)
                    {
                        doc.Add(new SortedDocValuesField("stringdv", br));
                        doc.Add(new NumericDocValuesField("id", numDocs));
                    }
                    else
                    {
                        doc.Add(NewStringField("id", Convert.ToString(numDocs), Field.Store.NO));
                    }
                    doc.Add(NewStringField("string", s, Field.Store.NO));
                    docValues.Add(br);
                }
                else
                {
                    br = null;
                    if (VERBOSE)
                    {
                        Console.WriteLine("  " + numDocs + ": <missing>");
                    }
                    docValues.Add(null);
                    if (DefaultCodecSupportsDocValues)
                    {
                        doc.Add(new NumericDocValuesField("id", numDocs));
                    }
                    else
                    {
                        doc.Add(NewStringField("id", Convert.ToString(numDocs), Field.Store.NO));
                    }
                }

                doc.Add(new StoredField("id", numDocs));
                writer.AddDocument(doc);
                numDocs++;

                if (random.Next(40) == 17)
                {
                    // force flush
                    writer.GetReader().Dispose();
                }
            }

            IndexReader r = writer.GetReader();

            writer.Dispose();
            if (VERBOSE)
            {
                Console.WriteLine("  reader=" + r);
            }

            IndexSearcher idxS = NewSearcher(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                r, false);
            int ITERS = AtLeast(100);

            for (int iter = 0; iter < ITERS; iter++)
            {
                bool reverse = random.NextBoolean();

                TopFieldDocs hits;
                SortField    sf;
                bool         sortMissingLast;
                bool         missingIsNull;
                if (DefaultCodecSupportsDocValues && random.NextBoolean())
                {
                    sf = new SortField("stringdv", SortFieldType.STRING, reverse);
                    // Can only use sort missing if the DVFormat
                    // supports docsWithField:
                    sortMissingLast = DefaultCodecSupportsDocsWithField && Random.NextBoolean();
                    missingIsNull   = DefaultCodecSupportsDocsWithField;
                }
                else
                {
                    sf = new SortField("string", SortFieldType.STRING, reverse);
                    sortMissingLast = Random.NextBoolean();
                    missingIsNull   = true;
                }
                if (sortMissingLast)
                {
                    sf.MissingValue = SortField.STRING_LAST;
                }

                Sort sort;
                if (random.NextBoolean())
                {
                    sort = new Sort(sf);
                }
                else
                {
                    sort = new Sort(sf, SortField.FIELD_DOC);
                }
                int          hitCount  = TestUtil.NextInt32(random, 1, r.MaxDoc + 20);
                RandomFilter f         = new RandomFilter(random, (float)random.NextDouble(), docValues);
                int          queryType = random.Next(3);
                if (queryType == 0)
                {
                    // force out of order
                    BooleanQuery bq = new BooleanQuery();
                    // Add a Query with SHOULD, since bw.Scorer() returns BooleanScorer2
                    // which delegates to BS if there are no mandatory clauses.
                    bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
                    // Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to return
                    // the clause instead of BQ.
                    bq.MinimumNumberShouldMatch = 1;
                    hits = idxS.Search(bq, f, hitCount, sort, random.NextBoolean(), random.NextBoolean());
                }
                else if (queryType == 1)
                {
                    hits = idxS.Search(new ConstantScoreQuery(f), null, hitCount, sort, random.NextBoolean(), random.NextBoolean());
                }
                else
                {
                    hits = idxS.Search(new MatchAllDocsQuery(), f, hitCount, sort, random.NextBoolean(), random.NextBoolean());
                }

                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter + " " + hits.TotalHits + " hits; topN=" + hitCount + "; reverse=" + reverse + "; sortMissingLast=" + sortMissingLast + " sort=" + sort);
                }

                // Compute expected results:
                var expected = f.MatchValues.ToList();
                expected.Sort(new ComparerAnonymousInnerClassHelper(this, sortMissingLast));
                if (reverse)
                {
                    expected.Reverse();
                }

                if (VERBOSE)
                {
                    Console.WriteLine("  expected:");
                    for (int idx = 0; idx < expected.Count; idx++)
                    {
                        BytesRef br = expected[idx];
                        if (br == null && missingIsNull == false)
                        {
                            br = new BytesRef();
                        }
                        Console.WriteLine("    " + idx + ": " + (br == null ? "<missing>" : br.Utf8ToString()));
                        if (idx == hitCount - 1)
                        {
                            break;
                        }
                    }
                }

                if (VERBOSE)
                {
                    Console.WriteLine("  actual:");
                    for (int hitIDX = 0; hitIDX < hits.ScoreDocs.Length; hitIDX++)
                    {
                        FieldDoc fd = (FieldDoc)hits.ScoreDocs[hitIDX];
                        BytesRef br = (BytesRef)fd.Fields[0];

                        Console.WriteLine("    " + hitIDX + ": " + (br == null ? "<missing>" : br.Utf8ToString()) + " id=" + idxS.Doc(fd.Doc).Get("id"));
                    }
                }
                for (int hitIDX = 0; hitIDX < hits.ScoreDocs.Length; hitIDX++)
                {
                    FieldDoc fd = (FieldDoc)hits.ScoreDocs[hitIDX];
                    BytesRef br = expected[hitIDX];
                    if (br == null && missingIsNull == false)
                    {
                        br = new BytesRef();
                    }

                    // Normally, the old codecs (that don't support
                    // docsWithField via doc values) will always return
                    // an empty BytesRef for the missing case; however,
                    // if all docs in a given segment were missing, in
                    // that case it will return null!  So we must map
                    // null here, too:
                    BytesRef br2 = (BytesRef)fd.Fields[0];
                    if (br2 == null && missingIsNull == false)
                    {
                        br2 = new BytesRef();
                    }

                    Assert.AreEqual(br, br2, "hit=" + hitIDX + " has wrong sort value");
                }
            }

            r.Dispose();
            dir.Dispose();
        }
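A minimal sketch of the sort setup this test randomizes, assuming Lucene.NET 4.8 APIs; searcher is a placeholder.

        SortField sortField = new SortField("stringdv", SortFieldType.STRING, false);
        sortField.MissingValue = SortField.STRING_LAST;   // docs missing the field sort last
        TopFieldDocs top = searcher.Search(new MatchAllDocsQuery(), 10,
                                           new Sort(sortField, SortField.FIELD_DOC));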
Example No. 17
            public override void Run()
            {
                try
                {
                    var            seen  = new JCG.HashSet <string>();
                    IList <string> paths = new List <string>();
                    while (true)
                    {
                        Document doc      = new Document();
                        int      numPaths = TestUtil.NextInt32(Random, 1, 5);
                        for (int i = 0; i < numPaths; i++)
                        {
                            string path;
                            if (paths.Count > 0 && Random.Next(5) != 4)
                            {
                                // Use previous path
                                path = paths[Random.Next(paths.Count)];
                            }
                            else
                            {
                                // Create new path
                                path = null;
                                while (true)
                                {
                                    path = TestUtil.RandomRealisticUnicodeString(Random);
                                    if (path.Length != 0 && !seen.Contains(path))
                                    {
                                        seen.Add(path);
                                        paths.Add(path);
                                        break;
                                    }
                                }
                            }
                            doc.Add(new FacetField("field", path));
                        }
                        try
                        {
                            w.AddDocument(config.Build(tw, doc));
                            if (mgr != null && Random.NextDouble() < 0.02)
                            {
                                w.Commit();
                                tw.Commit();
                                mgr.MaybeRefresh();
                            }
                        }
                        catch (IOException ioe)
                        {
                            throw new Exception(ioe.ToString(), ioe);
                        }

                        if (VERBOSE)
                        {
                            Console.WriteLine("TW size=" + tw.Count + " vs " + ordLimit);
                        }

                        if (tw.Count >= ordLimit)
                        {
                            break;
                        }
                    }
                }
                finally
                {
                    stop.Value = true;
                }
            }
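Example No. 18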
        public virtual void Test()
        {
            MockDirectoryWrapper dir = NewMockFSDirectory(CreateTempDir("TestIndexWriterOutOfFileDescriptors"));

            dir.PreventDoubleWrite = false;
            double rate = Random.NextDouble() * 0.01;

            //System.out.println("rate=" + rate);
            dir.RandomIOExceptionRateOnOpen = rate;
            int                  iters       = AtLeast(20);
            LineFileDocs         docs        = new LineFileDocs(Random, DefaultCodecSupportsDocValues);
            IndexReader          r           = null;
            DirectoryReader      r2          = null;
            bool                 any         = false;
            MockDirectoryWrapper dirCopy     = null;
            int                  lastNumDocs = 0;

            for (int iter = 0; iter < iters; iter++)
            {
                IndexWriter w = null;
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: iter=" + iter);
                }
                try
                {
                    MockAnalyzer analyzer = new MockAnalyzer(Random);
                    analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
                    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);

                    if (VERBOSE)
                    {
                        // Do this ourselves instead of relying on LTC so
                        // we see incrementing messageID:
                        iwc.SetInfoStream(new TextWriterInfoStream(Console.Out));
                    }
                    var ms = iwc.MergeScheduler;
                    if (ms is IConcurrentMergeScheduler)
                    {
                        ((IConcurrentMergeScheduler)ms).SetSuppressExceptions();
                    }
                    w = new IndexWriter(dir, iwc);
                    if (r != null && Random.Next(5) == 3)
                    {
                        if (Random.NextBoolean())
                        {
                            if (VERBOSE)
                            {
                                Console.WriteLine("TEST: addIndexes IR[]");
                            }
                            w.AddIndexes(new IndexReader[] { r });
                        }
                        else
                        {
                            if (VERBOSE)
                            {
                                Console.WriteLine("TEST: addIndexes Directory[]");
                            }
                            w.AddIndexes(new Directory[] { dirCopy });
                        }
                    }
                    else
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("TEST: addDocument");
                        }
                        w.AddDocument(docs.NextDoc());
                    }
                    dir.RandomIOExceptionRateOnOpen = 0.0;
                    w.Dispose();
                    w = null;

                    // NOTE: this is O(N^2)!  Only enable for temporary debugging:
                    //dir.setRandomIOExceptionRateOnOpen(0.0);
                    //TestUtil.CheckIndex(dir);
                    //dir.setRandomIOExceptionRateOnOpen(rate);

                    // Verify numDocs only increases, to catch IndexWriter
                    // accidentally deleting the index:
                    dir.RandomIOExceptionRateOnOpen = 0.0;
                    Assert.IsTrue(DirectoryReader.IndexExists(dir));
                    if (r2 == null)
                    {
                        r2 = DirectoryReader.Open(dir);
                    }
                    else
                    {
                        DirectoryReader r3 = DirectoryReader.OpenIfChanged(r2);
                        if (r3 != null)
                        {
                            r2.Dispose();
                            r2 = r3;
                        }
                    }
                    Assert.IsTrue(r2.NumDocs >= lastNumDocs, "before=" + lastNumDocs + " after=" + r2.NumDocs);
                    lastNumDocs = r2.NumDocs;
                    //System.out.println("numDocs=" + lastNumDocs);
                    dir.RandomIOExceptionRateOnOpen = rate;

                    any = true;
                    if (VERBOSE)
                    {
                        Console.WriteLine("TEST: iter=" + iter + ": success");
                    }
                }
                catch (IOException ioe)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("TEST: iter=" + iter + ": exception");
                        Console.WriteLine(ioe.ToString());
                        Console.Write(ioe.StackTrace);
                    }
                    if (w != null)
                    {
                        // NOTE: leave random IO exceptions enabled here,
                        // to verify that rollback does not try to write
                        // anything:
                        w.Rollback();
                    }
                }

                if (any && r == null && Random.NextBoolean())
                {
                    // Make a copy of a non-empty index so we can use
                    // it to addIndexes later:
                    dir.RandomIOExceptionRateOnOpen = 0.0;
                    r       = DirectoryReader.Open(dir);
                    dirCopy = NewMockFSDirectory(CreateTempDir("TestIndexWriterOutOfFileDescriptors.copy"));
                    ISet <string> files = new JCG.HashSet <string>();
                    foreach (string file in dir.ListAll())
                    {
                        dir.Copy(dirCopy, file, file, IOContext.DEFAULT);
                        files.Add(file);
                    }
                    dirCopy.Sync(files);
                    // Have IW kiss the dir so we remove any leftover
                    // files ... we can easily have leftover files at
                    // the time we take a copy because we are holding
                    // open a reader:
                    (new IndexWriter(dirCopy, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)))).Dispose();
                    dirCopy.RandomIOExceptionRate   = rate;
                    dir.RandomIOExceptionRateOnOpen = rate;
                }
            }

            if (r2 != null)
            {
                r2.Dispose();
            }
            if (r != null)
            {
                r.Dispose();
                dirCopy.Dispose();
            }
            dir.Dispose();
        }
Example No. 19
        public override void Build(IInputIterator iterator)
        {
            if (iterator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = iterator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while ((surfaceForm = iterator.Next()) != null)
                {
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = iterator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(iterator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength);

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the highest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
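A hedged usage sketch of the suggester this Build method appears to belong to (an AnalyzingSuggester, judging by the FST and payload handling above); the analyzer and input iterator are placeholders.

        var suggester = new AnalyzingSuggester(analyzer);
        suggester.Build(inputIterator);                       // runs the method above
        IList<LookupResult> results = suggester.DoLookup("moun", false, 5);
        foreach (LookupResult result in results)
        {
            Console.WriteLine(result.Key + " (" + result.Value + ")");
        }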
Example No. 20
        /// <summary>
        /// Detect repetition groups. Done once - for first doc. </summary>
        private IList <IList <PhrasePositions> > GatherRptGroups(JCG.LinkedDictionary <Term, int?> rptTerms)
        {
            PhrasePositions[] rpp = RepeatingPPs(rptTerms);
            IList <IList <PhrasePositions> > res = new List <IList <PhrasePositions> >();

            if (!hasMultiTermRpts)
            {
                // simpler - no multi-terms - can base on positions in first doc
                for (int i = 0; i < rpp.Length; i++)
                {
                    PhrasePositions pp = rpp[i];
                    if (pp.rptGroup >= 0) // already marked as a repetition
                    {
                        continue;
                    }
                    int tpPos = TpPos(pp);
                    for (int j = i + 1; j < rpp.Length; j++)
                    {
                        PhrasePositions pp2 = rpp[j];
                        // Skip if pp2 is already marked as a repetition, if the two PPs are
                        // originally at the same offset in the query (not a repetition!), or
                        // if their term positions differ (not a repetition).
                        if (pp2.rptGroup >= 0 || pp2.offset == pp.offset || TpPos(pp2) != tpPos)
                        {
                            continue;
                        }
                        // a repetition
                        int g = pp.rptGroup;
                        if (g < 0)
                        {
                            g           = res.Count;
                            pp.rptGroup = g;
                            List <PhrasePositions> rl = new List <PhrasePositions>(2);
                            rl.Add(pp);
                            res.Add(rl);
                        }
                        pp2.rptGroup = g;
                        res[g].Add(pp2);
                    }
                }
            }
            else
            {
                // more involved - has multi-terms
                IList <JCG.HashSet <PhrasePositions> > tmp = new List <JCG.HashSet <PhrasePositions> >();
                IList <FixedBitSet> bb = PpTermsBitSets(rpp, rptTerms);
                UnionTermGroups(bb);
                IDictionary <Term, int> tg = TermGroups(rptTerms, bb);
                JCG.HashSet <int>       distinctGroupIDs = new JCG.HashSet <int>(tg.Values);
                for (int i = 0; i < distinctGroupIDs.Count; i++)
                {
                    tmp.Add(new JCG.HashSet <PhrasePositions>());
                }
                foreach (PhrasePositions pp in rpp)
                {
                    foreach (Term t in pp.terms)
                    {
                        if (rptTerms.ContainsKey(t))
                        {
                            int g = tg[t];
                            tmp[g].Add(pp);
                            Debug.Assert(pp.rptGroup == -1 || pp.rptGroup == g);
                            pp.rptGroup = g;
                        }
                    }
                }
                foreach (JCG.HashSet <PhrasePositions> hs in tmp)
                {
                    res.Add(new List <PhrasePositions>(hs));
                }
            }
            return(res);
        }
Example No. 21
 /// <summary>
 /// Collects into <paramref name="strings"/> the strings that can be produced from the given
 /// state, returning <c>false</c> if more than <paramref name="limit"/> strings are found.
 /// <paramref name="limit"/>&lt;0 means "infinite".
 /// </summary>
 private static bool GetFiniteStrings(State s, JCG.HashSet <State> pathstates, JCG.HashSet <Int32sRef> strings, Int32sRef path, int limit)
 {
     pathstates.Add(s);
     foreach (Transition t in s.GetTransitions())
     {
         if (pathstates.Contains(t.to))
         {
             return(false);
         }
         for (int n = t.min; n <= t.max; n++)
         {
             path.Grow(path.Length + 1);
             path.Int32s[path.Length] = n;
             path.Length++;
             if (t.to.accept)
             {
                 strings.Add(Int32sRef.DeepCopyOf(path));
                 if (limit >= 0 && strings.Count > limit)
                 {
                     return(false);
                 }
             }
             if (!GetFiniteStrings(t.to, pathstates, strings, path, limit))
             {
                 return(false);
             }
             path.Length--;
         }
     }
     pathstates.Remove(s);
     return(true);
 }
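A minimal sketch of the public wrapper around this recursion, assuming Lucene.NET's automaton utilities:

     // Enumerate all accepted strings of a finite automaton; a negative limit
     // means "infinite", as documented above.
     Automaton a = BasicOperations.Union(
         BasicAutomata.MakeString("day"),
         BasicAutomata.MakeString("night"));
     var accepted = SpecialOperations.GetFiniteStrings(a, -1);
     Console.WriteLine(accepted.Count); // 2: one accepted path per input string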
Example No. 22
        /// <summary>
        /// (non-Javadoc)
        /// @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
        /// </summary>
        public virtual Query GetQuery(XmlElement e)
        {
            string fieldsList = e.GetAttribute("fieldNames"); //a comma-delimited list of fields

            string[] fields = defaultFieldNames;
            if ((fieldsList != null) && (fieldsList.Trim().Length > 0))
            {
                fields = fieldsList.Trim().Split(',').TrimEnd();
                //trim the fieldnames
                for (int i = 0; i < fields.Length; i++)
                {
                    fields[i] = fields[i].Trim();
                }
            }

            //Parse any "stopWords" attribute
            //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
            //I use all analyzers/fields to generate multi-field compatible stop list
            string        stopWords    = e.GetAttribute("stopWords");
            ISet <string> stopWordsSet = null;

            if ((stopWords != null) && (fields != null))
            {
                stopWordsSet = new JCG.HashSet <string>();
                foreach (string field in fields)
                {
                    TokenStream ts = null;
                    try
                    {
                        ts = analyzer.GetTokenStream(field, stopWords);
                        ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();
                        ts.Reset();
                        while (ts.IncrementToken())
                        {
                            stopWordsSet.Add(termAtt.ToString());
                        }
                        ts.End();
                    }
                    catch (IOException ioe)
                    {
                        throw new ParserException("IOException parsing stop words list in "
                                                  + GetType().Name + ":" + ioe.Message);
                    }
                    finally
                    {
                        IOUtils.DisposeWhileHandlingException(ts);
                    }
                }
            }

            MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.GetText(e), fields, analyzer, fields[0]);

            mlt.MaxQueryTerms       = DOMUtils.GetAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS);
            mlt.MinTermFrequency    = DOMUtils.GetAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY);
            mlt.PercentTermsToMatch = DOMUtils.GetAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100;
            mlt.StopWords           = stopWordsSet;
            int minDocFreq = DOMUtils.GetAttribute(e, "minDocFreq", -1);

            if (minDocFreq >= 0)
            {
                mlt.MinDocFreq = minDocFreq;
            }

            mlt.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);

            return(mlt);
        }
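A hedged usage sketch: the builder above only reads the attributes shown ("fieldNames", "stopWords", "maxQueryTerms", "minTermFrequency", "percentTermsToMatch", "minDocFreq", "boost") plus the element text. The element name and the builder instance below are assumptions for illustration:

    using System.Xml;
    using Lucene.Net.Search;

    var doc = new XmlDocument();
    doc.LoadXml(
        "<LikeThisQuery fieldNames=\"title,body\" maxQueryTerms=\"25\"" +
        " minTermFrequency=\"2\" percentTermsToMatch=\"50\" boost=\"1.5\">" +
        "seed text whose similar documents we want" +
        "</LikeThisQuery>");

    // 'builder' is an instance of the class shown above (hypothetical setup).
    Query q = builder.GetQuery(doc.DocumentElement);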
Exemplo n.º 23
0
        public override void AddBinaryField(FieldInfo field, IEnumerable <BytesRef> values)
        {
            // examine the values to determine best type to use
            ISet <BytesRef> uniqueValues = new JCG.HashSet <BytesRef>();
            int             minLength    = int.MaxValue;
            int             maxLength    = int.MinValue;

            foreach (var value in values)
            {
                BytesRef b = value;
                if (b is null)
                {
                    b = new BytesRef(); // 4.0 doesn't distinguish
                }
                if (b.Length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH)
                {
                    throw new ArgumentException("DocValuesField \"" + field.Name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
                }
                minLength = Math.Min(minLength, b.Length);
                maxLength = Math.Max(maxLength, b.Length);
                if (uniqueValues != null)
                {
                    if (uniqueValues.Add(BytesRef.DeepCopyOf(b)))
                    {
                        if (uniqueValues.Count > 256)
                        {
                            uniqueValues = null;
                        }
                    }
                }
            }

            int  maxDoc = state.SegmentInfo.DocCount;
            bool @fixed = minLength == maxLength;
            bool dedup  = uniqueValues != null && uniqueValues.Count * 2 < maxDoc;

            if (dedup)
            {
                // we will deduplicate and deref values
                bool        success   = false;
                IndexOutput data      = null;
                IndexOutput index     = null;
                string      dataName  = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat");
                string      indexName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "idx");
                try
                {
                    data  = dir.CreateOutput(dataName, state.Context);
                    index = dir.CreateOutput(indexName, state.Context);
                    if (@fixed)
                    {
                        AddFixedDerefBytesField(field, data, index, values, minLength);
                    }
                    else
                    {
                        AddVarDerefBytesField(field, data, index, values);
                    }
                    success = true;
                }
                finally
                {
                    if (success)
                    {
                        IOUtils.Dispose(data, index);
                    }
                    else
                    {
                        IOUtils.DisposeWhileHandlingException(data, index);
                    }
                }
            }
            else
            {
                // we don't deduplicate, just write values straight
                if (@fixed)
                {
                    // fixed byte[]
                    string      fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat");
                    IndexOutput data     = dir.CreateOutput(fileName, state.Context);
                    bool        success  = false;
                    try
                    {
                        AddFixedStraightBytesField(field, data, values, minLength);
                        success = true;
                    }
                    finally
                    {
                        if (success)
                        {
                            IOUtils.Dispose(data);
                        }
                        else
                        {
                            IOUtils.DisposeWhileHandlingException(data);
                        }
                    }
                }
                else
                {
                    // variable byte[]
                    bool        success   = false;
                    IndexOutput data      = null;
                    IndexOutput index     = null;
                    string      dataName  = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat");
                    string      indexName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "idx");
                    try
                    {
                        data  = dir.CreateOutput(dataName, state.Context);
                        index = dir.CreateOutput(indexName, state.Context);
                        AddVarStraightBytesField(field, data, index, values);
                        success = true;
                    }
                    finally
                    {
                        if (success)
                        {
                            IOUtils.Dispose(data, index);
                        }
                        else
                        {
                            IOUtils.DisposeWhileHandlingException(data, index);
                        }
                    }
                }
            }
        }
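The value scan above picks one of four byte[] layouts. A minimal sketch of that decision logic, with illustrative names that are not part of the codec API:

    // Mirrors the heuristic above: fixed vs. variable length comes from the
    // min/max scan; deref (deduplicated) storage is chosen only while at most
    // 256 distinct values were seen AND writing each value once pays off.
    static string ChooseBytesVariant(int minLength, int maxLength, int? uniqueCount, int maxDoc)
    {
        bool isFixed = minLength == maxLength;            // every value the same length?
        // uniqueCount is null once more than 256 distinct values were seen
        bool dedup = uniqueCount.HasValue && uniqueCount.Value * 2 < maxDoc;
        if (dedup)
        {
            return isFixed ? "FixedDeref" : "VarDeref";   // values written once + index
        }
        return isFixed ? "FixedStraight" : "VarStraight"; // values written per document
    }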
Exemplo n.º 24
0
        private static JCG.HashSet <Type> LoadTypes() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            var types = new JCG.HashSet <Type>();

            var assembliesToExamine = Support.AssemblyUtils.GetReferencedAssemblies();

            // LUCENENET NOTE: The following hack is not required because we are using abstract factories
            // and pure DI to ensure the order of the codecs is always correct during testing.

            //// LUCENENET HACK:
            //// Tests such as TestImpersonation.cs expect that the assemblies
            //// are probed in a certain order. NamedSPILoader, lines 68 - 75 adds
            //// the first item it sees with that name. So if you have multiple
            //// codecs, it may not add the right one, depending on the order of
            //// the assemblies that were examined.
            //// This results in many test failures if Types from Lucene.Net.Codecs
            //// are examined and added to NamedSPILoader first before
            //// Lucene.Net.TestFramework.
            //var testFrameworkAssembly = assembliesToExamine.FirstOrDefault(x => string.Equals(x.GetName().Name, "Lucene.Net.TestFramework", StringComparison.Ordinal));
            //if (testFrameworkAssembly != null)
            //{
            //    //assembliesToExamine.Remove(testFrameworkAssembly);
            //    //assembliesToExamine.Insert(0, testFrameworkAssembly);
            //    assembliesToExamine = new Assembly[] { testFrameworkAssembly }.Concat(assembliesToExamine.Where(a => !testFrameworkAssembly.Equals(a)));
            //}

            foreach (var assembly in assembliesToExamine)
            {
                try
                {
                    foreach (var type in assembly.GetTypes().Where(x => x.IsPublic))
                    {
                        try
                        {
                            if (!IsInvokableSubclassOf <S>(type))
                            {
                                continue;
                            }

                            // We are looking for types with a default ctor
                            // (which is used in NamedSPILoader) or with a single parameter
                            // of type IDictionary<string, string> (for AnalysisSPILoader)
                            var matchingCtors = type.GetConstructors().Where(ctor =>
                            {
                                var parameters = ctor.GetParameters();

                                switch (parameters.Length)
                                {
                                case 0:            // default ctor
                                    return(false); // LUCENENET NOTE: Now that we have factored Codecs into Abstract Factories, we don't need default constructors here

                                case 1:
                                    return(typeof(IDictionary <string, string>).IsAssignableFrom(parameters[0].ParameterType));

                                default:
                                    return(false);
                                }
                            });

                            if (matchingCtors.Any())
                            {
                                types.Add(type);
                            }
                        }
                        catch
                        {
                            // swallow
                        }
                    }
                }
                catch
                {
                    // swallow
                }
            }
            return(types);
        }
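Once LoadTypes() has collected candidates, a loader can construct one by invoking the single IDictionary&lt;string, string&gt; constructor it filtered for. A sketch under that assumption (Instantiate is a hypothetical helper, not the NamedSPILoader API):

    using System;
    using System.Collections.Generic;

    static S Instantiate<S>(Type type, IDictionary<string, string> args) where S : class
    {
        // The ctor LoadTypes() matched on: exactly one IDictionary<string, string> parameter.
        var ctor = type.GetConstructor(new[] { typeof(IDictionary<string, string>) });
        if (ctor is null)
            throw new ArgumentException($"{type} has no IDictionary<string, string> constructor.");
        return (S)ctor.Invoke(new object[] { args });
    }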
Exemplo n.º 25
0
        private IndexContext CreateIndexContext()
        {
            Random random = Random;

            DocValuesType[] dvTypes = new DocValuesType[] {
                DocValuesType.BINARY,
                DocValuesType.SORTED
            };

            Directory         dir = NewDirectory();
            RandomIndexWriter w   = new RandomIndexWriter(
                random,
                dir,
                NewIndexWriterConfig(TEST_VERSION_CURRENT,
                                     new MockAnalyzer(random)).SetMergePolicy(NewLogMergePolicy())
                );

            bool          canUseDV = !"Lucene3x".Equals(w.IndexWriter.Config.Codec.Name, StringComparison.Ordinal);
            DocValuesType dvType   = canUseDV ? dvTypes[random.Next(dvTypes.Length)] : DocValuesType.NONE;

            int numDocs = 86 + random.Next(1087) * RandomMultiplier;

            string[] groupValues = new string[numDocs / 5];
            string[] countValues = new string[numDocs / 10];
            for (int i = 0; i < groupValues.Length; i++)
            {
                groupValues[i] = GenerateRandomNonEmptyString();
            }
            for (int i = 0; i < countValues.Length; i++)
            {
                countValues[i] = GenerateRandomNonEmptyString();
            }

            JCG.List <string> contentStrings = new JCG.List <string>();
            IDictionary <string, IDictionary <string, ISet <string> > > searchTermToGroupCounts = new JCG.Dictionary <string, IDictionary <string, ISet <string> > >();

            for (int i = 1; i <= numDocs; i++)
            {
                string groupValue = random.Next(23) == 14 ? null : groupValues[random.Next(groupValues.Length)];
                string countValue = random.Next(21) == 13 ? null : countValues[random.Next(countValues.Length)];
                string content    = "random" + random.Next(numDocs / 20);
                if (!searchTermToGroupCounts.TryGetValue(content, out var groupToCounts))
                {
                    // Groups always sort DOCID asc...
                    searchTermToGroupCounts.Add(content, groupToCounts = new JCG.LinkedDictionary <string, ISet <string> >());
                    contentStrings.Add(content);
                }

                if (!groupToCounts.TryGetValue(groupValue, out var countsVals))
                {
                    groupToCounts.Add(groupValue, countsVals = new JCG.HashSet <string>());
                }
                countsVals.Add(countValue);

                Document doc = new Document();
                doc.Add(new StringField("id", string.Format(CultureInfo.InvariantCulture, "{0:D9}", i), Field.Store.YES));
                if (groupValue != null)
                {
                    AddField(doc, groupField, groupValue, dvType);
                }
                if (countValue != null)
                {
                    AddField(doc, countField, countValue, dvType);
                }
                doc.Add(new TextField("content", content, Field.Store.YES));
                w.AddDocument(doc);
            }

            DirectoryReader reader = w.GetReader();

            if (Verbose)
            {
                for (int docID = 0; docID < reader.MaxDoc; docID++)
                {
                    Document doc = reader.Document(docID);
                    Console.WriteLine("docID=" + docID + " id=" + doc.Get("id") + " content=" + doc.Get("content") + " author=" + doc.Get("author") + " publisher=" + doc.Get("publisher"));
                }
            }

            w.Dispose();
            return(new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.ToArray(/*new String[contentStrings.size()]*/)));
        }
Exemplo n.º 26
0
 public SingleInstanceLock(JCG.HashSet <string> locks, string lockName)
 {
     this.locks    = locks;
     this.lockName = lockName;
 }
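The ctor above only captures the shared set; a plausible sketch of the matching Obtain/Release/IsLocked members (the real SingleInstanceLockFactory synchronizes on that same set, but treat this as an approximation, not the verbatim implementation):

    public override bool Obtain()
    {
        lock (locks)
            return locks.Add(lockName); // false if another instance already holds this name
    }

    public override void Release()
    {
        lock (locks)
            locks.Remove(lockName);
    }

    public override bool IsLocked()
    {
        lock (locks)
            return locks.Contains(lockName);
    }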
Exemplo n.º 27
0
        internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage)
        {
            meta.WriteVInt32(field.Number);
            meta.WriteByte((byte)Lucene42DocValuesProducer.NUMBER);
            meta.WriteInt64(data.GetFilePointer());
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd      = 0;
            // TODO: more efficient?
            ISet <long> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new JCG.HashSet <long>();

                long count = 0;
                foreach (long? nv in values)
                {
                    // TODO: support this as MemoryDVFormat (and be smart about missing maybe)
                    long v = nv.GetValueOrDefault();

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        }
                        else if (count != 0) // minValue needs to be set first
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    if (uniqueValues != null)
                    {
                        if (uniqueValues.Add(v))
                        {
                            if (uniqueValues.Count > 256)
                            {
                                uniqueValues = null;
                            }
                        }
                    }

                    ++count;
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(count == maxDoc);
                }
            }

            if (uniqueValues != null)
            {
                // small number of unique values
                int           bitsPerValue  = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
                if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    meta.WriteByte((byte)Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed
                    foreach (long? nv in values)
                    {
                        data.WriteByte((byte)nv.GetValueOrDefault());
                    }
                }
                else
                {
                    meta.WriteByte((byte)Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed
                    var decode = new long[uniqueValues.Count];
                    uniqueValues.CopyTo(decode, 0);
                    var encode = new Dictionary <long, int>();
                    data.WriteVInt32(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        data.WriteInt64(decode[i]);
                        encode[decode[i]] = i;
                    }

                    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                    data.WriteVInt32(formatAndBits.Format.Id);
                    data.WriteVInt32(formatAndBits.BitsPerValue);

                    PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
                    foreach (long? nv in values)
                    {
                        writer.Add(encode[nv.GetValueOrDefault()]);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                meta.WriteByte((byte)Lucene42DocValuesProducer.GCD_COMPRESSED);
                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteInt64(minValue);
                data.WriteInt64(gcd);
                data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE);
                foreach (long? nv in values)
                {
                    writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                meta.WriteByte((byte)Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed

                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE);
                foreach (long? nv in values)
                {
                    writer.Add(nv.GetValueOrDefault());
                }
                writer.Finish();
            }
        }
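A worked sketch of the GCD_COMPRESSED branch above (ignoring its overflow guard): each value is stored as (v - minValue) / gcd and decoded as minValue + gcd * stored, so widely spaced values pack into small integers.

    using System.Linq;
    using Lucene.Net.Util; // MathUtil, as used above

    long[] values = { 100, 130, 160, 130 };
    long min = values.Min();              // 100
    long gcd = 0;
    foreach (long v in values)
    {
        gcd = MathUtil.Gcd(gcd, v - min); // Gcd(0, x) == x, so this converges to 30
    }
    // encoded: (v - min) / gcd  => { 0, 1, 2, 1 }  (packs into 2 bits/value)
    // decoded: min + gcd * encoded recovers { 100, 130, 160, 130 }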
Exemplo n.º 28
0
        public virtual void Test()
        {
            int NUM_DOCS = AtLeast(1000);

            Directory         dir = NewDirectory();
            RandomIndexWriter w   = null;

            int docsLeftInThisSegment = 0;

            int docUpto = 0;

            while (docUpto < NUM_DOCS)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: " + docUpto + " of " + NUM_DOCS);
                }
                if (docsLeftInThisSegment == 0)
                {
                    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
                    if (Random.NextBoolean())
                    {
                        // Make sure we aggressively mix in a second codec
                        // (Lucene46 here; the original Java test used SimpleText)
                        // since it has different impls for all codec formats...
                        iwc.SetCodec(Codec.ForName("Lucene46"));
                    }
                    if (w != null)
                    {
                        w.Dispose();
                    }
                    w = new RandomIndexWriter(Random, dir, iwc);
                    docsLeftInThisSegment = TestUtil.NextInt32(Random, 10, 100);
                }
                Document doc = new Document();
                doc.Add(NewStringField("id", Convert.ToString(docUpto), Field.Store.YES));
                w.AddDocument(doc);
                docUpto++;
                docsLeftInThisSegment--;
            }

            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: now delete...");
            }

            // Randomly delete half the docs:
            ISet <int?> deleted = new JCG.HashSet <int?>();

            while (deleted.Count < NUM_DOCS / 2)
            {
                int? toDelete = Random.Next(NUM_DOCS);
                if (!deleted.Contains(toDelete))
                {
                    deleted.Add(toDelete);
                    w.DeleteDocuments(new Term("id", Convert.ToString(toDelete)));
                    if (Random.Next(17) == 6)
                    {
                        IndexReader r = w.GetReader();
                        Assert.AreEqual(NUM_DOCS - deleted.Count, r.NumDocs);
                        r.Dispose();
                    }
                }
            }

            w.Dispose();
            dir.Dispose();
        }
Exemplo n.º 29
0
        public virtual void TestGetChildren()
        {
            Directory dir = NewDirectory();
            var       taxoWriter = new DirectoryTaxonomyWriter(dir);
            int       numCategories = AtLeast(10);
            int       numA = 0, numB = 0;
            Random    random = Random;

            // add the two categories for which we'll also add children (so asserts are simpler)
            taxoWriter.AddCategory(new FacetLabel("a"));
            taxoWriter.AddCategory(new FacetLabel("b"));
            for (int i = 0; i < numCategories; i++)
            {
                if (random.NextBoolean())
                {
                    taxoWriter.AddCategory(new FacetLabel("a", Convert.ToString(i, CultureInfo.InvariantCulture)));
                    ++numA;
                }
                else
                {
                    taxoWriter.AddCategory(new FacetLabel("b", Convert.ToString(i, CultureInfo.InvariantCulture)));
                    ++numB;
                }
            }
            // add category with no children
            taxoWriter.AddCategory(new FacetLabel("c"));
            taxoWriter.Dispose();

            var taxoReader = new DirectoryTaxonomyReader(dir);

            // nonexistent category
            TaxonomyReader.ChildrenEnumerator it = taxoReader.GetChildren(taxoReader.GetOrdinal(new FacetLabel("invalid")));

            Assert.AreEqual(false, it.MoveNext());

            // a category with no children
            it = taxoReader.GetChildren(taxoReader.GetOrdinal(new FacetLabel("c")));
            Assert.AreEqual(false, it.MoveNext());

            // arbitrary negative ordinal
            it = taxoReader.GetChildren(-2);
            Assert.AreEqual(false, it.MoveNext());

            // root's children
            var roots = new JCG.HashSet <string> {
                "a", "b", "c"
            };

            it = taxoReader.GetChildren(TaxonomyReader.ROOT_ORDINAL);
            while (roots.Count > 0)
            {
                it.MoveNext();
                FacetLabel root = taxoReader.GetPath(it.Current);
                Assert.AreEqual(1, root.Length);
                Assert.IsTrue(roots.Remove(root.Components[0]));
            }
            Assert.AreEqual(false, it.MoveNext());

            for (int i = 0; i < 2; i++)
            {
                FacetLabel cp      = i == 0 ? new FacetLabel("a") : new FacetLabel("b");
                int        ordinal = taxoReader.GetOrdinal(cp);
                it = taxoReader.GetChildren(ordinal);
                int numChildren = 0;
                int child;
                while (it.MoveNext())
                {
                    child = it.Current;
                    FacetLabel path = taxoReader.GetPath(child);
                    Assert.AreEqual(2, path.Length);
                    Assert.AreEqual(path.Components[0], i == 0 ? "a" : "b");
                    ++numChildren;
                }
                int expected = i == 0 ? numA : numB;
                Assert.AreEqual(expected, numChildren, "invalid num children");
            }
            taxoReader.Dispose();

            dir.Dispose();
        }
Exemplo n.º 30
0
        private IndexContext CreateIndexContext(bool multipleFacetValuesPerDocument)
        {
            Random random    = Random;
            int    numDocs   = TestUtil.NextInt32(random, 138, 1145) * RANDOM_MULTIPLIER;
            int    numGroups = TestUtil.NextInt32(random, 1, numDocs / 4);
            int    numFacets = TestUtil.NextInt32(random, 1, numDocs / 6);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
            }

            List <string> groups = new List <string>();

            for (int i = 0; i < numGroups; i++)
            {
                groups.Add(GenerateRandomNonEmptyString());
            }
            List <string> facetValues = new List <string>();

            for (int i = 0; i < numFacets; i++)
            {
                facetValues.Add(GenerateRandomNonEmptyString());
            }
            string[] contentBrs = new string[TestUtil.NextInt32(random, 2, 20)];
            if (VERBOSE)
            {
                Console.WriteLine("TEST: create fake content");
            }
            for (int contentIDX = 0; contentIDX < contentBrs.Length; contentIDX++)
            {
                contentBrs[contentIDX] = GenerateRandomNonEmptyString();
                if (VERBOSE)
                {
                    Console.WriteLine("  content=" + contentBrs[contentIDX]);
                }
            }

            Directory         dir    = NewDirectory();
            RandomIndexWriter writer = new RandomIndexWriter(
                random,
                dir,
                NewIndexWriterConfig(
                    TEST_VERSION_CURRENT,
                    new MockAnalyzer(random)
                    )
                );
            bool canUseDV = !"Lucene3x".Equals(writer.IndexWriter.Config.Codec.Name, StringComparison.Ordinal);
            bool useDv    = canUseDV && !multipleFacetValuesPerDocument && random.NextBoolean();

            Document doc               = new Document();
            Document docNoGroup        = new Document();
            Document docNoFacet        = new Document();
            Document docNoGroupNoFacet = new Document();
            Field    group             = NewStringField("group", "", Field.Store.NO);
            Field    groupDc           = new SortedDocValuesField("group_dv", new BytesRef());

            if (useDv)
            {
                doc.Add(groupDc);
                docNoFacet.Add(groupDc);
            }
            doc.Add(group);
            docNoFacet.Add(group);
            Field[] facetFields;
            if (useDv)
            {
                Debug.Assert(!multipleFacetValuesPerDocument);
                facetFields    = new Field[2];
                facetFields[0] = NewStringField("facet", "", Field.Store.NO);
                doc.Add(facetFields[0]);
                docNoGroup.Add(facetFields[0]);
                facetFields[1] = new SortedDocValuesField("facet_dv", new BytesRef());
                doc.Add(facetFields[1]);
                docNoGroup.Add(facetFields[1]);
            }
            else
            {
                facetFields = multipleFacetValuesPerDocument ? new Field[2 + random.Next(6)] : new Field[1];
                for (int i = 0; i < facetFields.Length; i++)
                {
                    facetFields[i] = NewStringField("facet", "", Field.Store.NO);
                    doc.Add(facetFields[i]);
                    docNoGroup.Add(facetFields[i]);
                }
            }
            Field content = NewStringField("content", "", Field.Store.NO);

            doc.Add(content);
            docNoGroup.Add(content);
            docNoFacet.Add(content);
            docNoGroupNoFacet.Add(content);

            ISet <string> uniqueFacetValues = new JCG.SortedSet <string>(new ComparerAnonymousHelper1());

            // LUCENENET NOTE: Need JCG.Dictionary here because of null keys
            IDictionary <string, JCG.Dictionary <string, ISet <string> > > searchTermToFacetToGroups = new Dictionary <string, JCG.Dictionary <string, ISet <string> > >();
            int facetWithMostGroups = 0;

            for (int i = 0; i < numDocs; i++)
            {
                string groupValue;
                if (random.Next(24) == 17)
                {
                    // So we test the "doc doesn't have the grouped
                    // field" case:
                    if (useDv)
                    {
                        groupValue = "";
                    }
                    else
                    {
                        groupValue = null;
                    }
                }
                else
                {
                    groupValue = groups[random.Next(groups.Count)];
                }

                string contentStr = contentBrs[random.Next(contentBrs.Length)];
                if (!searchTermToFacetToGroups.TryGetValue(contentStr, out JCG.Dictionary <string, ISet <string> > facetToGroups))
                {
                    searchTermToFacetToGroups[contentStr] = facetToGroups = new JCG.Dictionary <string, ISet <string> >();
                }

                List <string> facetVals = new List <string>();
                if (useDv || random.Next(24) != 18)
                {
                    if (useDv)
                    {
                        string facetValue = facetValues[random.Next(facetValues.Count)];
                        uniqueFacetValues.Add(facetValue);
                        if (!facetToGroups.TryGetValue(facetValue, out ISet <string> groupsInFacet))
                        {
                            facetToGroups[facetValue] = groupsInFacet = new JCG.HashSet <string>();
                        }
                        groupsInFacet.Add(groupValue);
                        if (groupsInFacet.Count > facetWithMostGroups)
                        {
                            facetWithMostGroups = groupsInFacet.Count;
                        }
                        facetFields[0].SetStringValue(facetValue);
                        facetFields[1].SetBytesValue(new BytesRef(facetValue));
                        facetVals.Add(facetValue);
                    }
                    else
                    {
                        foreach (Field facetField in facetFields)
                        {
                            string facetValue = facetValues[random.Next(facetValues.Count)];
                            uniqueFacetValues.Add(facetValue);
                            if (!facetToGroups.TryGetValue(facetValue, out ISet <string> groupsInFacet))
                            {
                                facetToGroups[facetValue] = groupsInFacet = new JCG.HashSet <string>();
                            }
                            groupsInFacet.Add(groupValue);
                            if (groupsInFacet.Count > facetWithMostGroups)
                            {
                                facetWithMostGroups = groupsInFacet.Count;
                            }
                            facetField.SetStringValue(facetValue);
                            facetVals.Add(facetValue);
                        }
                    }
                }
                else
                {
                    uniqueFacetValues.Add(null);
                    if (!facetToGroups.TryGetValue(null, out ISet <string> groupsInFacet))
                    {
                        facetToGroups[null] = groupsInFacet = new JCG.HashSet <string>();
                    }
                    groupsInFacet.Add(groupValue);
                    if (groupsInFacet.Count > facetWithMostGroups)
                    {
                        facetWithMostGroups = groupsInFacet.Count;
                    }
                }

                if (VERBOSE)
                {
                    Console.WriteLine("  doc content=" + contentStr + " group=" + (groupValue == null ? "null" : groupValue) + " facetVals=" + Collections.ToString(facetVals));
                }

                if (groupValue != null)
                {
                    if (useDv)
                    {
                        groupDc.SetBytesValue(new BytesRef(groupValue));
                    }
                    group.SetStringValue(groupValue);
                }
                else if (useDv)
                {
                    // DV cannot have missing values:
                    groupDc.SetBytesValue(new BytesRef());
                }
                content.SetStringValue(contentStr);
                if (groupValue == null && !facetVals.Any())
                {
                    writer.AddDocument(docNoGroupNoFacet);
                }
                else if (!facetVals.Any())
                {
                    writer.AddDocument(docNoFacet);
                }
                else if (groupValue == null)
                {
                    writer.AddDocument(docNoGroup);
                }
                else
                {
                    writer.AddDocument(doc);
                }
            }

            DirectoryReader reader = writer.GetReader();

            writer.Dispose();

            return(new IndexContext(searchTermToFacetToGroups, reader, numDocs, dir, facetWithMostGroups, numGroups, contentBrs, uniqueFacetValues, useDv));
        }