Exemple #1
0
        private void CreateRandomIndexes(int maxSegments)
        {
            dir     = NewDirectory();
            numDocs = AtLeast(150);
            int           numTerms    = TestUtil.NextInt32(Random, 1, numDocs / 5);
            ISet <string> randomTerms = new JCG.HashSet <string>();

            while (randomTerms.size() < numTerms)
            {
                randomTerms.add(TestUtil.RandomSimpleString(Random));
            }
            terms = new JCG.List <string>(randomTerms);
            long seed             = Random.NextInt64();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new J2N.Randomizer(seed)));

            iwc.SetMergePolicy(TestSortingMergePolicy.NewSortingMergePolicy(sort));
            iw = new RandomIndexWriter(new J2N.Randomizer(seed), dir, iwc);
            for (int i = 0; i < numDocs; ++i)
            {
                Document doc = RandomDocument();
                iw.AddDocument(doc);
                if (i == numDocs / 2 || (i != numDocs - 1 && Random.nextInt(8) == 0))
                {
                    iw.Commit();
                }
                if (Random.nextInt(15) == 0)
                {
                    string term = RandomPicks.RandomFrom(Random, terms);
                    iw.DeleteDocuments(new Term("s", term));
                }
            }
            reader = iw.GetReader();
        }
Exemple #2
0
        public virtual void runTestQuery(SpatialMatchConcern concern, SpatialTestQuery q)
        {
            String msg = q.toString(); //"Query: " + q.args.toString(ctx);
            SearchResults got = executeQuery(makeQuery(q), Math.Max(100, q.ids.size() + 1));
            if (storeShape && got.numFound > 0)
            {
                //check stored value is there
                assertNotNull(got.results[0].document.Get(strategy.FieldName));
            }
            if (concern.orderIsImportant)
            {
                IEnumerator<String> ids = q.ids.GetEnumerator();
                foreach (SearchResult r in got.results)
                {
                    String id = r.document.Get("id");
                    if (!ids.MoveNext())
                    {
                        fail(msg + " :: Did not get enough results.  Expect" + q.ids + ", got: " + got.toDebugString());
                    }
                    assertEquals("out of order: " + msg, ids.Current, id);
                }

                if (ids.MoveNext())
                {
                    fail(msg + " :: expect more results then we got: " + ids.Current);
                }
            }
            else
            {
                // We are looking at how the results overlap
                if (concern.resultsAreSuperset)
                {
                    ISet<string> found = new JCG.HashSet<string>();
                    foreach (SearchResult r in got.results)
                    {
                        found.add(r.document.Get("id"));
                    }
                    foreach (String s in q.ids)
                    {
                        if (!found.contains(s))
                        {
                            fail("Results are mising id: " + s + " :: " + found);
                        }
                    }
                }
                else
                {
                    List<string> found = new List<string>();
                    foreach (SearchResult r in got.results)
                    {
                        found.Add(r.document.Get("id"));
                    }

                    // sort both so that the order is not important
                    CollectionUtil.TimSort(q.ids);
                    CollectionUtil.TimSort(found);
                    assertEquals(msg, q.ids.toString(), found.toString());
                }
            }
        }
Exemple #3
0
        public void TestWithContexts()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
            KeyValuePair <List <string>, IDictionary <string, Document> > res = GenerateIndexDocuments(AtLeast(1000), true, true);
            IDictionary <string, Document> docs = res.Value;
            List <string> invalidDocTerms       = res.Key;

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();
            IndexReader    ir            = DirectoryReader.Open(dir);
            IDictionary    dictionary    = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME);
            IInputIterator inputIterator = dictionary.GetEntryIterator();
            BytesRef       f;

            while ((f = inputIterator.Next()) != null)
            {
                string   field = f.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                //Document doc = docs.remove(f.utf8ToString());
                assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
                IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
                assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
                ISet <BytesRef>        oriCtxs    = new JCG.HashSet <BytesRef>();
                IEnumerable <BytesRef> contextSet = inputIterator.Contexts;
                foreach (IIndexableField ctxf in doc.GetFields(CONTEXT_FIELD_NAME))
                {
                    oriCtxs.add(ctxf.GetBinaryValue());
                }
                assertEquals(oriCtxs.size(), contextSet.Count());
            }

            foreach (string invalidTerm in invalidDocTerms)
            {
                var invalid = docs[invalidTerm];
                docs.Remove(invalidTerm);
                assertNotNull(invalid);
            }
            assertTrue(!docs.Any());

            ir.Dispose();
            dir.Dispose();
        }
Exemple #4
0
        public void TestMultiThreaded()
        {
            FileInfo    file    = new FileInfo(Path.Combine(getWorkDir().FullName, "one-line"));
            PerfRunData runData = createPerfRunData(file, false, typeof(ThreadingDocMaker).AssemblyQualifiedName);

            ThreadJob[] threads = new ThreadJob[10];
            using (WriteLineDocTask wldt = new WriteLineDocTask(runData))
            {
                for (int i = 0; i < threads.Length; i++)
                {
                    threads[i] = new ThreadAnonymousHelper("t" + i, wldt);
                }

                foreach (ThreadJob t in threads)
                {
                    t.Start();
                }
                foreach (ThreadJob t in threads)
                {
                    t.Join();
                }
            } // wldt.Dispose();

            ISet <String> ids = new JCG.HashSet <string>();
            TextReader    br  = new StreamReader(new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.None), Encoding.UTF8);

            try
            {
                String line = br.ReadLine();
                assertHeaderLine(line); // header line is written once, no matter how many threads there are
                for (int i = 0; i < threads.Length; i++)
                {
                    line = br.ReadLine();
                    assertNotNull($"line for index {i} is missing", line); // LUCENENET specific - ensure the line is there before splitting
                    String[] parts = line.Split(WriteLineDocTask.SEP).TrimEnd();
                    assertEquals(line, 3, parts.Length);
                    // check that all thread names written are the same in the same line
                    String tname = parts[0].Substring(parts[0].IndexOf('_'));
                    ids.add(tname);
                    assertEquals(tname, parts[1].Substring(parts[1].IndexOf('_')));
                    assertEquals(tname, parts[2].Substring(parts[2].IndexOf('_')));
                }
                // only threads.length lines should exist
                assertNull(br.ReadLine());
                assertEquals(threads.Length, ids.size());
            }
            finally
            {
                br.Dispose();
            }
        }
Exemple #5
0
        public void TestWithContext()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer            = new RandomIndexWriter(Random, dir, iwc);
            IDictionary <string, Document> docs = GenerateIndexDocuments(AtLeast(100));

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();

            IndexReader ir = DirectoryReader.Open(dir);

            ValueSource[]    toAdd         = new ValueSource[] { new Int64FieldSource(WEIGHT_FIELD_NAME_1), new Int64FieldSource(WEIGHT_FIELD_NAME_2), new Int64FieldSource(WEIGHT_FIELD_NAME_3) };
            IDictionary      dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                string   field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
                long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
                long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                assertEquals(inputIterator.Weight, (w1 + w2 + w3));
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));

                // LUCENENET NOTE: This test was once failing because we used SCG.HashSet<T> whose
                // Equals() implementation does not check for set equality. As a result SortedInputEnumerator
                // had been modified to reverse the results to get the test to pass. However, using JCG.HashSet<T>
                // ensures that set equality (that is equality that doesn't care about order of items) is respected.
                // SortedInputEnumerator has also had the specific sorting removed.
                ISet <BytesRef> originalCtxs = new JCG.HashSet <BytesRef>();
                foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
                {
                    originalCtxs.add(ctxf.GetBinaryValue());
                }
                assertEquals(originalCtxs, inputIterator.Contexts);
            }
            assertTrue(docs.Count == 0);
            ir.Dispose();
            dir.Dispose();
        }
Exemple #6
0
        public void TestDefaultFilter()
        {
            DuplicateFilter df      = new DuplicateFilter(KEY_FIELD);
            ISet <string>   results = new JCG.HashSet <string>();

            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;

            foreach (ScoreDoc hit in hits)
            {
                Document d   = searcher.Doc(hit.Doc);
                string   url = d.Get(KEY_FIELD);
                assertFalse("No duplicate urls should be returned", results.contains(url));
                results.add(url);
            }
        }
        /**
         * Makes a bunch of single-char tokens (the max # unique terms will at most be 26).
         * puts the # unique terms into expected, to be checked against the norm.
         */
        private string AddValue()
        {
            StringBuilder sb    = new StringBuilder();
            ISet <string> terms = new JCG.HashSet <string>();
            int           num   = TestUtil.NextInt32(Random, 0, 255);

            for (int i = 0; i < num; i++)
            {
                sb.append(' ');
                char term = (char)TestUtil.NextInt32(Random, 'a', 'z');
                sb.append(term);
                terms.add("" + term);
            }
            expected.Add(terms.size());
            return(sb.toString());
        }
Exemple #8
0
        protected virtual void AssertOperation(IDictionary<String, IShape> indexedDocs,
                                       SpatialOperation operation, IShape queryShape)
        {
            //Generate truth via brute force
            ISet<string> expectedIds = new JCG.HashSet<string>();
            foreach (var stringShapeEntry in indexedDocs)
            {
                if (operation.Evaluate(stringShapeEntry.Value, queryShape))
                    expectedIds.add(stringShapeEntry.Key);
            }

            SpatialTestQuery testQuery = new SpatialTestQuery();
            testQuery.args = new SpatialArgs(operation, queryShape);
            testQuery.ids = new List<string>(expectedIds);
            runTestQuery(SpatialMatchConcern.FILTER, testQuery);
        }
Exemple #9
0
        /**
         * read a set of queries from a resource file
         */
        private ISet <string> ReadQueries(string resource)
        {
            ISet <string> queries = new JCG.HashSet <string>();
            Stream        stream  = GetType().getResourceAsStream(resource);
            TextReader    reader  = new StreamReader(stream, Encoding.UTF8);
            String        line    = null;

            while ((line = reader.ReadLine()) != null)
            {
                line = line.Trim();
                if (line.Length > 0 && !line.StartsWith("#", StringComparison.Ordinal) && !line.StartsWith("//", StringComparison.Ordinal))
                {
                    queries.add(line);
                }
            }
            return(queries);
        }
Exemple #10
0
        public void TestWithContext()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer            = new RandomIndexWriter(Random, dir, iwc);
            IDictionary <string, Document> docs = GenerateIndexDocuments(AtLeast(100));

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();

            IndexReader ir = DirectoryReader.Open(dir);

            ValueSource[]  toAdd         = new ValueSource[] { new Int64FieldSource(WEIGHT_FIELD_NAME_1), new Int64FieldSource(WEIGHT_FIELD_NAME_2), new Int64FieldSource(WEIGHT_FIELD_NAME_3) };
            IDictionary    dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
            IInputIterator inputIterator = dictionary.GetEntryIterator();
            BytesRef       f;

            while ((f = inputIterator.Next()) != null)
            {
                string   field = f.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
                long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
                long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
                assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
                assertEquals(inputIterator.Weight, (w1 + w2 + w3));
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
                ISet <BytesRef> originalCtxs = new JCG.HashSet <BytesRef>();
                foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
                {
                    originalCtxs.add(ctxf.GetBinaryValue());
                }
                assertEquals(originalCtxs, inputIterator.Contexts);
            }
            assertTrue(docs.Count == 0);
            ir.Dispose();
            dir.Dispose();
        }
        private void checkHits(SpatialArgs args, int assertNumFound, int[] assertIds)
        {
            SearchResults got = executeQuery(strategy.MakeQuery(args), 100);

            assertEquals("" + args, assertNumFound, got.numFound);
            if (assertIds != null)
            {
                ISet <int?> gotIds = new JCG.HashSet <int?>();
                foreach (SearchResult result in got.results)
                {
                    gotIds.add(int.Parse(result.document.Get("id"), CultureInfo.InvariantCulture));
                }
                foreach (int assertId in assertIds)
                {
                    assertTrue("has " + assertId, gotIds.contains(assertId));
                }
            }
        }
Exemple #12
0
        public void TestFastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);

            df.ProcessingMode = (ProcessingMode.PM_FAST_INVALIDATION);
            ISet <string> results = new JCG.HashSet <string>();

            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
            assertTrue("Filtered searching should have found some matches", hits.Length > 0);

            foreach (ScoreDoc hit in hits)
            {
                Document d   = searcher.Doc(hit.Doc);
                string   url = d.Get(KEY_FIELD);
                assertFalse("No duplicate urls should be returned", results.contains(url));
                results.add(url);
            }
            assertEquals("Two urls found", 2, results.size());
        }
Exemple #13
0
        public void TestNoFilter()
        {
            ISet <string> results = new JCG.HashSet <string>();

            ScoreDoc[] hits = searcher.Search(tq, null, 1000).ScoreDocs;
            assertTrue("Default searching should have found some matches", hits.Length > 0);
            bool dupsFound = false;

            foreach (ScoreDoc hit in hits)
            {
                Document d   = searcher.Doc(hit.Doc);
                string   url = d.Get(KEY_FIELD);
                if (!dupsFound)
                {
                    dupsFound = results.contains(url);
                }
                results.add(url);
            }
            assertTrue("Default searching should have found duplicate urls", dupsFound);
        }
            public object Create(Random random)
            {
                NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
                // we can't add duplicate keys, or NormalizeCharMap gets angry
                ISet <string> keys = new JCG.HashSet <string>();
                int           num  = random.nextInt(5);

                //System.out.println("NormalizeCharMap=");
                for (int i = 0; i < num; i++)
                {
                    string key = TestUtil.RandomSimpleString(random);
                    if (!keys.contains(key) && key.Length > 0)
                    {
                        string value = TestUtil.RandomSimpleString(random);
                        builder.Add(key, value);
                        keys.add(key);
                        //System.out.println("mapping: '" + key + "' => '" + value + "'");
                    }
                }
                return(builder.Build());
            }
Exemple #15
0
        private void _CheckHits(bool bbox, IPoint pt, double distKM, int assertNumFound, params int[] assertIds)
        {
            SpatialOperation op      = SpatialOperation.Intersects;
            double           distDEG = DistanceUtils.Dist2Degrees(distKM, DistanceUtils.EARTH_MEAN_RADIUS_KM);
            IShape           shape   = ctx.MakeCircle(pt, distDEG);

            if (bbox)
            {
                shape = shape.BoundingBox;
            }

            SpatialArgs args = new SpatialArgs(op, shape);
            //args.setDistPrecision(0.025);
            Query query;

            if (Random.nextBoolean())
            {
                query = strategy.MakeQuery(args);
            }
            else
            {
                query = new FilteredQuery(new MatchAllDocsQuery(), strategy.MakeFilter(args));
            }
            SearchResults results = executeQuery(query, 100);

            assertEquals("" + shape, assertNumFound, results.numFound);
            if (assertIds != null)
            {
                ISet <int?> resultIds = new JCG.HashSet <int?>();
                foreach (SearchResult result in results.results)
                {
                    resultIds.add(int.Parse(result.document.Get("id"), CultureInfo.InvariantCulture));
                }
                foreach (int assertId in assertIds)
                {
                    assertTrue("has " + assertId, resultIds.contains(assertId));
                }
            }
        }
        private GroupedFacetResult CreateExpectedFacetResult(string searchTerm, IndexContext context, int offset, int limit, int minCount, bool orderByCount, string facetPrefix)
        {
            JCG.Dictionary <string, ISet <string> > facetGroups;
            if (!context.searchTermToFacetGroups.TryGetValue(searchTerm, out facetGroups))
            {
                facetGroups = new JCG.Dictionary <string, ISet <string> >();
            }

            int           totalCount     = 0;
            int           totalMissCount = 0;
            ISet <string> facetValues;

            if (facetPrefix != null)
            {
                facetValues = new JCG.HashSet <string>();
                foreach (string facetValue in context.facetValues)
                {
                    if (facetValue != null && facetValue.StartsWith(facetPrefix, StringComparison.Ordinal))
                    {
                        facetValues.add(facetValue);
                    }
                }
            }
            else
            {
                facetValues = context.facetValues;
            }

            List <TermGroupFacetCollector.FacetEntry> entries = new List <TermGroupFacetCollector.FacetEntry>(facetGroups.size());

            // also includes facets with count 0
            foreach (string facetValue in facetValues)
            {
                if (facetValue == null)
                {
                    continue;
                }

                int count = facetGroups.TryGetValue(facetValue, out ISet <string> groups) && groups != null?groups.size() : 0;

                if (count >= minCount)
                {
                    entries.Add(new TermGroupFacetCollector.FacetEntry(new BytesRef(facetValue), count));
                }
                totalCount += count;
            }

            // Only include null count when no facet prefix is specified
            if (facetPrefix == null)
            {
                if (facetGroups.TryGetValue(null, out ISet <string> groups) && groups != null)
                {
                    totalMissCount = groups.size();
                }
            }

            entries.Sort(Comparer <TermGroupFacetCollector.FacetEntry> .Create((a, b) => {
                if (orderByCount)
                {
                    int cmp = b.Count - a.Count;
                    if (cmp != 0)
                    {
                        return(cmp);
                    }
                }
                return(a.Value.CompareTo(b.Value));
            }));

            int endOffset = offset + limit;
            IList <TermGroupFacetCollector.FacetEntry> entriesResult;

            if (offset >= entries.size())
            {
                entriesResult = Collections.EmptyList <TermGroupFacetCollector.FacetEntry>();
            }
            else if (endOffset >= entries.size())
            {
                entriesResult = entries.GetRange(offset, entries.size() - offset);
            }
            else
            {
                entriesResult = entries.GetRange(offset, endOffset - offset);
            }
            return(new GroupedFacetResult(totalCount, totalMissCount, entriesResult));
        }
        public void TestRandom()
        {
            string[]      terms = new string[TestUtil.NextInt32(Random, 2, 10)];
            ISet <string> seen  = new JCG.HashSet <string>();

            while (seen.size() < terms.Length)
            {
                string token = TestUtil.RandomSimpleString(Random, 1, 5);
                if (!seen.contains(token))
                {
                    terms[seen.size()] = token;
                    seen.add(token);
                }
            }

            Analyzer a = new MockAnalyzer(Random);

            int  numDocs   = AtLeast(10);
            long totTokens = 0;

            string[][] docs = new string[numDocs][];
            for (int i = 0; i < numDocs; i++)
            {
                docs[i] = new string[AtLeast(100)];
                if (Verbose)
                {
                    Console.Write("  doc " + i + ":");
                }
                for (int j = 0; j < docs[i].Length; j++)
                {
                    docs[i][j] = GetZipfToken(terms);
                    if (Verbose)
                    {
                        Console.Write(" " + docs[i][j]);
                    }
                }
                if (Verbose)
                {
                    Console.WriteLine();
                }
                totTokens += docs[i].Length;
            }

            int grams = TestUtil.NextInt32(Random, 1, 4);

            if (Verbose)
            {
                Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
            }

            // Build suggester model:
            FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);

            sug.Build(new TestRandomInputIterator(this, docs));

            // Build inefficient but hopefully correct model:
            List <IDictionary <string, int?> > gramCounts = new List <IDictionary <string, int?> >(grams);

            for (int gram = 0; gram < grams; gram++)
            {
                if (Verbose)
                {
                    Console.WriteLine("TEST: build model for gram=" + gram);
                }
                IDictionary <string, int?> model = new JCG.Dictionary <string, int?>();
                gramCounts.Add(model);
                foreach (string[] doc in docs)
                {
                    for (int i = 0; i < doc.Length - gram; i++)
                    {
                        StringBuilder b = new StringBuilder();
                        for (int j = i; j <= i + gram; j++)
                        {
                            if (j > i)
                            {
                                b.append(' ');
                            }
                            b.append(doc[j]);
                        }
                        string token = b.toString();
                        if (!model.TryGetValue(token, out int?curCount) || curCount == null)
                        {
                            model.Put(token, 1);
                        }
                        else
                        {
                            model.Put(token, 1 + curCount);
                        }
                        if (Verbose)
                        {
                            Console.WriteLine("  add '" + token + "' -> count=" + (model.TryGetValue(token, out int?count) ? (count.HasValue ? count.ToString() : "null") : ""));
                        }
                    }
                }
            }

            int lookups = AtLeast(100);

            for (int iter = 0; iter < lookups; iter++)
            {
                string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
                for (int i = 0; i < tokens.Length; i++)
                {
                    tokens[i] = GetZipfToken(terms);
                }

                // Maybe trim last token; be sure not to create the
                // empty string:
                int trimStart;
                if (tokens.Length == 1)
                {
                    trimStart = 1;
                }
                else
                {
                    trimStart = 0;
                }
                int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
                tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt - 0);

                int           num = TestUtil.NextInt32(Random, 1, 100);
                StringBuilder b   = new StringBuilder();
                foreach (string token in tokens)
                {
                    b.append(' ');
                    b.append(token);
                }
                string query = b.toString();
                query = query.Substring(1);

                if (Verbose)
                {
                    Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
                }

                // Expected:
                List <Lookup.LookupResult> expected = new List <Lookup.LookupResult>();
                double backoff = 1.0;
                seen = new JCG.HashSet <string>();

                if (Verbose)
                {
                    Console.WriteLine("  compute expected");
                }
                for (int i = grams - 1; i >= 0; i--)
                {
                    if (Verbose)
                    {
                        Console.WriteLine("    grams=" + i);
                    }

                    if (tokens.Length < i + 1)
                    {
                        // Don't have enough tokens to use this model
                        if (Verbose)
                        {
                            Console.WriteLine("      skip");
                        }
                        continue;
                    }

                    if (i == 0 && tokens[tokens.Length - 1].Length == 0)
                    {
                        // Never suggest unigrams from empty string:
                        if (Verbose)
                        {
                            Console.WriteLine("      skip unigram priors only");
                        }
                        continue;
                    }

                    // Build up "context" ngram:
                    b = new StringBuilder();
                    for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
                    {
                        b.append(' ');
                        b.append(tokens[j]);
                    }
                    string context = b.toString();
                    if (context.Length > 0)
                    {
                        context = context.Substring(1);
                    }
                    if (Verbose)
                    {
                        Console.WriteLine("      context='" + context + "'");
                    }
                    long contextCount;
                    if (context.Length == 0)
                    {
                        contextCount = totTokens;
                    }
                    else
                    {
                        //int? count = gramCounts.get(i - 1).get(context);
                        var gramCount = gramCounts[i - 1];
                        if (!gramCount.TryGetValue(context, out int?count) || count == null)
                        {
                            // We never saw this context:
                            backoff *= FreeTextSuggester.ALPHA;
                            if (Verbose)
                            {
                                Console.WriteLine("      skip: never saw context");
                            }
                            continue;
                        }
                        contextCount = count.GetValueOrDefault();
                    }
                    if (Verbose)
                    {
                        Console.WriteLine("      contextCount=" + contextCount);
                    }
                    IDictionary <string, int?> model = gramCounts[i];

                    // First pass, gather all predictions for this model:
                    if (Verbose)
                    {
                        Console.WriteLine("      find terms w/ prefix=" + tokens[tokens.Length - 1]);
                    }
                    List <Lookup.LookupResult> tmp = new List <Lookup.LookupResult>();
                    foreach (string term in terms)
                    {
                        if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                        {
                            if (Verbose)
                            {
                                Console.WriteLine("        term=" + term);
                            }
                            if (seen.contains(term))
                            {
                                if (Verbose)
                                {
                                    Console.WriteLine("          skip seen");
                                }
                                continue;
                            }
                            string ngram = (context + " " + term).Trim();
                            //Integer count = model.get(ngram);
                            if (model.TryGetValue(ngram, out int?count) && count != null)
                            {
                                // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                                Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                                tmp.Add(lr);
                                if (Verbose)
                                {
                                    Console.WriteLine("      add tmp key='" + lr.Key + "' score=" + lr.Value);
                                }
                            }
                        }
                    }

                    // Second pass, trim to only top N, and fold those
                    // into overall suggestions:
                    tmp.Sort(byScoreThenKey);
                    if (tmp.size() > num)
                    {
                        //tmp.subList(num, tmp.size()).clear();
                        tmp.RemoveRange(num, tmp.size() - num);
                    }
                    foreach (Lookup.LookupResult result in tmp)
                    {
                        string key = result.Key.toString();
                        int    idx = key.LastIndexOf(' ');
                        string lastToken;
                        if (idx != -1)
                        {
                            lastToken = key.Substring(idx + 1);
                        }
                        else
                        {
                            lastToken = key;
                        }
                        if (!seen.contains(lastToken))
                        {
                            seen.add(lastToken);
                            expected.Add(result);
                            if (Verbose)
                            {
                                Console.WriteLine("      keep key='" + result.Key + "' score=" + result.Value);
                            }
                        }
                    }

                    backoff *= FreeTextSuggester.ALPHA;
                }

                expected.Sort(byScoreThenKey);

                if (expected.size() > num)
                {
                    expected.RemoveRange(num, expected.size() - num);
                }

                // Actual:
                IList <Lookup.LookupResult> actual = sug.DoLookup(query, num);

                if (Verbose)
                {
                    Console.WriteLine("  expected: " + expected);
                    Console.WriteLine("    actual: " + actual);
                }

                assertEquals(expected.ToString(), actual.ToString());
            }
        }
Exemple #18
0
        private void CreateRandomIndexes()
        {
            dir1 = NewDirectory();
            dir2 = NewDirectory();
            int           numDocs     = AtLeast(150);
            int           numTerms    = TestUtil.NextInt32(Random, 1, numDocs / 5);
            ISet <string> randomTerms = new JCG.HashSet <string>();

            while (randomTerms.size() < numTerms)
            {
                randomTerms.add(TestUtil.RandomSimpleString(Random));
            }
            terms = new JCG.List <string>(randomTerms);
            long seed = Random.NextInt64();
            IndexWriterConfig iwc1 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed)));
            IndexWriterConfig iwc2 = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(new Random((int)seed)));

            iwc2.SetMergePolicy(NewSortingMergePolicy(sort));
            RandomIndexWriter iw1 = new RandomIndexWriter(new Random((int)seed), dir1, iwc1);
            RandomIndexWriter iw2 = new RandomIndexWriter(new Random((int)seed), dir2, iwc2);

            for (int i = 0; i < numDocs; ++i)
            {
                if (Random.nextInt(5) == 0 && i != numDocs - 1)
                {
                    string term = RandomPicks.RandomFrom(Random, terms);
                    iw1.DeleteDocuments(new Term("s", term));
                    iw2.DeleteDocuments(new Term("s", term));
                }
                Document doc = randomDocument();
                iw1.AddDocument(doc);
                iw2.AddDocument(doc);
                if (Random.nextInt(8) == 0)
                {
                    iw1.Commit();
                    iw2.Commit();
                }
            }
            // Make sure we have something to merge
            iw1.Commit();
            iw2.Commit();
            Document doc2 = randomDocument();

            // NOTE: don't use RIW.addDocument directly, since it sometimes commits
            // which may trigger a merge, at which case forceMerge may not do anything.
            // With field updates this is a problem, since the updates can go into the
            // single segment in the index, and threefore the index won't be sorted.
            // This hurts the assumption of the test later on, that the index is sorted
            // by SortingMP.
            iw1.IndexWriter.AddDocument(doc2);
            iw2.IndexWriter.AddDocument(doc2);

            if (DefaultCodecSupportsFieldUpdates)
            {
                // update NDV of docs belonging to one term (covers many documents)
                long   value = Random.NextInt64();
                string term  = RandomPicks.RandomFrom(Random, terms);
                iw1.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
                iw2.IndexWriter.UpdateNumericDocValue(new Term("s", term), "ndv", value);
            }

            iw1.ForceMerge(1);
            iw2.ForceMerge(1);
            iw1.Dispose();
            iw2.Dispose();
            reader       = DirectoryReader.Open(dir1);
            sortedReader = DirectoryReader.Open(dir2);
        }
        public void TestTerms()
        {
            Random random = Random;
            int    num    = AtLeast(10000);

#pragma warning disable 612, 618
            IComparer <BytesRef> comparer = random.nextBoolean() ? BytesRef.UTF8SortedAsUnicodeComparer : BytesRef.UTF8SortedAsUTF16Comparer;
#pragma warning restore 612, 618
            IDictionary <BytesRef, KeyValuePair <long, BytesRef> > sorted = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, BytesRef> >(comparer);
            IDictionary <BytesRef, long> sortedWithoutPayload             = new JCG.SortedDictionary <BytesRef, long>(comparer);
            IDictionary <BytesRef, KeyValuePair <long, ISet <BytesRef> > > sortedWithContext = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, ISet <BytesRef> > >(comparer);
            IDictionary <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > sortedWithPayloadAndContext = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > >(comparer);
            Input[]         unsorted = new Input[num];
            Input[]         unsortedWithoutPayload        = new Input[num];
            Input[]         unsortedWithContexts          = new Input[num];
            Input[]         unsortedWithPayloadAndContext = new Input[num];
            ISet <BytesRef> ctxs;
            for (int i = 0; i < num; i++)
            {
                BytesRef key2;
                BytesRef payload;
                ctxs = new JCG.HashSet <BytesRef>();
                do
                {
                    key2    = new BytesRef(TestUtil.RandomUnicodeString(random));
                    payload = new BytesRef(TestUtil.RandomUnicodeString(random));
                    for (int j = 0; j < AtLeast(2); j++)
                    {
                        ctxs.add(new BytesRef(TestUtil.RandomUnicodeString(random)));
                    }
                } while (sorted.ContainsKey(key2));
                long value = random.Next();
                sortedWithoutPayload.Put(key2, value);
                sorted.Put(key2, new KeyValuePair <long, BytesRef>(value, payload));
                sortedWithContext.Put(key2, new KeyValuePair <long, ISet <BytesRef> >(value, ctxs));
                sortedWithPayloadAndContext.Put(key2, new KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > >(value, new KeyValuePair <BytesRef, ISet <BytesRef> >(payload, ctxs)));
                unsorted[i] = new Input(key2, value, payload);
                unsortedWithoutPayload[i]        = new Input(key2, value);
                unsortedWithContexts[i]          = new Input(key2, value, ctxs);
                unsortedWithPayloadAndContext[i] = new Input(key2, value, payload, ctxs);
            }

            // test the sorted iterator wrapper with payloads
            IInputIterator wrapper = new SortedInputIterator(new InputArrayIterator(unsorted), comparer);
            IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, BytesRef> > > expected = sorted.GetEnumerator();
            while (expected.MoveNext())
            {
                KeyValuePair <BytesRef, KeyValuePair <long, BytesRef> > entry = expected.Current;


                assertEquals(entry.Key, wrapper.Next());
                assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
                assertEquals(entry.Value.Value, wrapper.Payload);
            }
            assertNull(wrapper.Next());

            // test the sorted iterator wrapper with contexts
            wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithContexts), comparer);
            IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, ISet <BytesRef> > > > actualEntries = sortedWithContext.GetEnumerator();
            while (actualEntries.MoveNext())
            {
                KeyValuePair <BytesRef, KeyValuePair <long, ISet <BytesRef> > > entry = actualEntries.Current;
                assertEquals(entry.Key, wrapper.Next());
                assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
                ISet <BytesRef> actualCtxs = entry.Value.Value;
                assertEquals(actualCtxs, wrapper.Contexts);
            }
            assertNull(wrapper.Next());

            // test the sorted iterator wrapper with contexts and payload
            wrapper = new SortedInputIterator(new InputArrayIterator(unsortedWithPayloadAndContext), comparer);
            IEnumerator <KeyValuePair <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > > expectedPayloadContextEntries = sortedWithPayloadAndContext.GetEnumerator();
            while (expectedPayloadContextEntries.MoveNext())
            {
                KeyValuePair <BytesRef, KeyValuePair <long, KeyValuePair <BytesRef, ISet <BytesRef> > > > entry = expectedPayloadContextEntries.Current;
                assertEquals(entry.Key, wrapper.Next());
                assertEquals(Convert.ToInt64(entry.Value.Key), wrapper.Weight);
                ISet <BytesRef> actualCtxs = entry.Value.Value.Value;
                assertEquals(actualCtxs, wrapper.Contexts);
                BytesRef actualPayload = entry.Value.Value.Key;
                assertEquals(actualPayload, wrapper.Payload);
            }
            assertNull(wrapper.Next());

            // test the unsorted iterator wrapper with payloads
            wrapper = new UnsortedInputIterator(new InputArrayIterator(unsorted));
            IDictionary <BytesRef, KeyValuePair <long, BytesRef> > actual = new JCG.SortedDictionary <BytesRef, KeyValuePair <long, BytesRef> >();
            BytesRef key;
            while ((key = wrapper.Next()) != null)
            {
                long     value   = wrapper.Weight;
                BytesRef payload = wrapper.Payload;
                actual.Put(BytesRef.DeepCopyOf(key), new KeyValuePair <long, BytesRef>(value, BytesRef.DeepCopyOf(payload)));
            }
            assertEquals(sorted, actual);

            // test the sorted iterator wrapper without payloads
            IInputIterator wrapperWithoutPayload = new SortedInputIterator(new InputArrayIterator(unsortedWithoutPayload), comparer);
            IEnumerator <KeyValuePair <BytesRef, long> > expectedWithoutPayload = sortedWithoutPayload.GetEnumerator();
            while (expectedWithoutPayload.MoveNext())
            {
                KeyValuePair <BytesRef, long> entry = expectedWithoutPayload.Current;


                assertEquals(entry.Key, wrapperWithoutPayload.Next());
                assertEquals(Convert.ToInt64(entry.Value), wrapperWithoutPayload.Weight);
                assertNull(wrapperWithoutPayload.Payload);
            }
            assertNull(wrapperWithoutPayload.Next());

            // test the unsorted iterator wrapper without payloads
            wrapperWithoutPayload = new UnsortedInputIterator(new InputArrayIterator(unsortedWithoutPayload));
            IDictionary <BytesRef, long> actualWithoutPayload = new JCG.SortedDictionary <BytesRef, long>();
            while ((key = wrapperWithoutPayload.Next()) != null)
            {
                long value = wrapperWithoutPayload.Weight;
                assertNull(wrapperWithoutPayload.Payload);
                actualWithoutPayload.Put(BytesRef.DeepCopyOf(key), value);
            }
            assertEquals(sortedWithoutPayload, actualWithoutPayload);
        }