コード例 #1
0
 /// <summary>
 /// Creates the searcher. <paramref name="fst"/>, <paramref name="num"/>,
 /// <paramref name="size"/> and <paramref name="weightComparator"/> are handed
 /// straight to the base constructor; the remaining arguments are stored on
 /// this instance.
 /// </summary>
 /// <param name="outerInstance">The enclosing suggester, kept for later use.</param>
 /// <param name="fst">Forwarded to the base constructor.</param>
 /// <param name="num">Forwarded to the base constructor.</param>
 /// <param name="size">Forwarded to the base constructor.</param>
 /// <param name="weightComparator">Forwarded to the base constructor.</param>
 /// <param name="seen">Set of byte sequences stored in the <c>seen</c> field.</param>
 /// <param name="finalLastToken">Stored in the <c>finalLastToken</c> field.</param>
 public TopNSearcherAnonymousInnerClassHelper(
     FreeTextSuggester outerInstance,
     FST <long?> fst,
     int num,
     UnknownType size,
     UnknownType weightComparator,
     HashSet <BytesRef> seen,
     BytesRef finalLastToken)
     : base(fst, num, size, weightComparator)
 {
     // Each instance starts with its own empty scratch buffer.
     scratchBytes        = new BytesRef();

     // Captured state supplied by the caller.
     this.finalLastToken = finalLastToken;
     this.seen           = seen;
     this.outerInstance  = outerInstance;
 }
コード例 #2
0
        /// <summary>
        /// Builds a 2-gram suggester from a single input in which "bar" repeats,
        /// then asserts that looking up "foo b" renders as "foo bar/1.00".
        /// </summary>
        public void TestNoDupsAcrossGrams()
        {
            const byte separator = 0x20;

            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("foo bar bar bar bar", 50));
            Analyzer analyzer = new MockAnalyzer(Random);
            FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 2, separator);
            suggester.Build(new InputArrayIterator(inputs));

            string suggestions = ToString(suggester.DoLookup("foo b", 10));

            assertEquals("foo bar/1.00", suggestions);
        }
コード例 #3
0
        /// <summary>
        /// Builds a 3-gram suggester with <c>TestEndingHoleAnalyzer</c> and asserts
        /// that looking up "wizard of of" renders as the empty string.
        /// </summary>
        public void TestTwoEndingHoles()
        {
            const byte separator = 0x20;

            // Just deletes "of"
            Analyzer holeAnalyzer = new TestEndingHoleAnalyzer();

            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("wizard of of oz", 50));
            FreeTextSuggester suggester = new FreeTextSuggester(holeAnalyzer, holeAnalyzer, 3, separator);
            suggester.Build(new InputArrayIterator(inputs));

            string suggestions = ToString(suggester.DoLookup("wizard of of", 10));

            assertEquals("", suggestions);
        }
コード例 #4
0
        /// <summary>
        /// Builds a 1-gram suggester from a single input and asserts the rendered
        /// suggestions for the prefix "b".
        /// </summary>
        public void TestUnigrams()
        {
            const byte separator = 0x20;

            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("foo bar baz blah boo foo bar foo bee", 50));
            Analyzer analyzer = new MockAnalyzer(Random);
            FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 1, separator);
            suggester.Build(new InputArrayIterator(inputs));

            string suggestions = ToString(suggester.DoLookup("b", 10));

            // Sorts first by count, descending, second by term, ascending
            assertEquals("bar/0.22 baz/0.11 bee/0.11 blah/0.11 boo/0.11", suggestions);
        }
コード例 #5
0
 /// <summary>
 /// Creates the searcher. <paramref name="fst"/>, <paramref name="num"/>,
 /// <paramref name="size"/> and <paramref name="weightComparer"/> are handed
 /// straight to the base constructor; the remaining arguments are stored on
 /// this instance.
 /// </summary>
 /// <param name="outerInstance">The enclosing suggester, kept for later use.</param>
 /// <param name="fst">Forwarded to the base constructor.</param>
 /// <param name="num">Forwarded to the base constructor.</param>
 /// <param name="size">Forwarded to the base constructor.</param>
 /// <param name="weightComparer">Forwarded to the base constructor.</param>
 /// <param name="seen">Stored in the <c>seen</c> field.</param>
 /// <param name="finalLastToken">Stored in the <c>finalLastToken</c> field.</param>
 public TopNSearcherAnonymousInnerClassHelper(
     FreeTextSuggester outerInstance,
     FST <long?> fst,
     int num,
     int size,
     IComparer <long?> weightComparer,
     IEnumerable <BytesRef> seen,
     BytesRef finalLastToken)
     : base(fst, num, size, weightComparer)
 {
     // Each instance starts with its own empty scratch buffer.
     scratchBytes        = new BytesRef();

     // Captured state supplied by the caller.
     this.finalLastToken = finalLastToken;
     this.seen           = seen;
     this.outerInstance  = outerInstance;
 }
コード例 #6
0
 /// <summary>
 /// Creates the searcher. <paramref name="fst"/>, <paramref name="num"/>,
 /// <paramref name="size"/> and <paramref name="weightComparer"/> are handed
 /// straight to the base constructor; the remaining arguments are stored on
 /// this instance.
 /// </summary>
 /// <param name="outerInstance">The enclosing suggester, kept for later use.</param>
 /// <param name="fst">Forwarded to the base constructor.</param>
 /// <param name="num">Forwarded to the base constructor.</param>
 /// <param name="size">Forwarded to the base constructor.</param>
 /// <param name="weightComparer">Forwarded to the base constructor.</param>
 /// <param name="seen">Stored in the <c>seen</c> field.</param>
 /// <param name="finalLastToken">Stored in the <c>finalLastToken</c> field.</param>
 public TopNSearcherAnonymousClass(
     FreeTextSuggester outerInstance,
     FST <Int64> fst,
     int num,
     int size,
     IComparer <Int64> weightComparer,
     ISet <BytesRef> seen,
     BytesRef finalLastToken)
     : base(fst, num, size, weightComparer)
 {
     // Each instance starts with its own empty scratch buffer.
     scratchBytes        = new BytesRef();

     // Captured state supplied by the caller.
     this.finalLastToken = finalLastToken;
     this.seen           = seen;
     this.outerInstance  = outerInstance;
 }
コード例 #7
0
        /// <summary>
        /// Builds a 2-gram suggester from two inputs and checks four lookups. It
        /// then stores the suggester to a temp file, loads it into a fresh instance
        /// and runs the same checks a second time against the reloaded suggester.
        /// </summary>
        public void TestBasic()
        {
            const byte separator = 0x20;

            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("foo bar baz blah", 50),
                new Input("boo foo bar foo bee", 20));

            Analyzer analyzer = new MockAnalyzer(Random);
            FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 2, separator);
            suggester.Build(new InputArrayIterator(inputs));
            assertEquals(2, suggester.Count);

            // Round 0 checks the freshly built suggester; round 1 checks the
            // instance that was reloaded at the end of round 0.
            int round = 0;
            while (round < 2)
            {
                // Uses bigram model and unigram backoff:
                string bigramWithBackoff = ToString(suggester.DoLookup("foo b", 10));
                assertEquals("foo bar/0.67 foo bee/0.33 baz/0.04 blah/0.04 boo/0.04", bigramWithBackoff);

                // Uses only bigram model:
                string bigramOnly = ToString(suggester.DoLookup("foo ", 10));
                assertEquals("foo bar/0.67 foo bee/0.33", bigramOnly);

                // Uses only unigram model:
                string unigramExact = ToString(suggester.DoLookup("foo", 10));
                assertEquals("foo/0.33", unigramExact);

                // Uses only unigram model:
                string unigramPrefix = ToString(suggester.DoLookup("b", 10));
                assertEquals("bar/0.22 baz/0.11 bee/0.11 blah/0.11 boo/0.11", unigramPrefix);

                // Try again after save/load:
                DirectoryInfo scratchDir = CreateTempDir("FreeTextSuggesterTest");
                FileInfo storeFile = new FileInfo(Path.Combine(scratchDir.FullName, "suggester"));

                using (Stream output = new FileStream(storeFile.FullName, FileMode.Create, FileAccess.Write))
                {
                    suggester.Store(output);
                }

                using (Stream input = new FileStream(storeFile.FullName, FileMode.Open, FileAccess.Read))
                {
                    suggester = new FreeTextSuggester(analyzer, analyzer, 2, separator);
                    suggester.Load(input);
                }
                assertEquals(2, suggester.Count);

                round++;
            }
        }
コード例 #8
0
        /// <summary>
        /// Feeds the suggester an input containing the byte 0x1e and expects
        /// <c>Build</c> to throw <see cref="ArgumentException"/>.
        /// </summary>
        public void TestIllegalByteDuringBuild()
        {
            // Default separator is INFORMATION SEPARATOR TWO
            // (0x1e), so no input token is allowed to contain it
            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("foo\u001ebar baz", 50));
            Analyzer analyzer = new MockAnalyzer(Random);
            FreeTextSuggester suggester = new FreeTextSuggester(analyzer);

            try
            {
                suggester.Build(new InputArrayIterator(inputs));
                fail("did not hit expected exception");
            }
            catch (ArgumentException)
            {
                // expected
            }
        }
コード例 #9
0
        /// <summary>
        /// Builds a 3-gram suggester with <c>TestEndingHoleAnalyzer</c> over
        /// "wizard of oz" and checks the rendered suggestions for two queries.
        /// </summary>
        public void TestEndingHole()
        {
            const byte separator = 0x20;

            // Just deletes "of"
            Analyzer holeAnalyzer = new TestEndingHoleAnalyzer();

            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("wizard of oz", 50));
            FreeTextSuggester suggester = new FreeTextSuggester(holeAnalyzer, holeAnalyzer, 3, separator);
            suggester.Build(new InputArrayIterator(inputs));

            string afterHole = ToString(suggester.DoLookup("wizard of", 10));
            assertEquals("wizard _ oz/1.00", afterHole);

            // Falls back to unigram model, with backoff 0.4 times
            // prop 0.5:
            string partialLastToken = ToString(suggester.DoLookup("wizard o", 10));
            assertEquals("oz/0.20", partialLastToken);
        }
コード例 #10
0
        /// <summary>
        /// Builds a 2-gram suggester and expects a lookup of the empty string to
        /// throw <see cref="ArgumentException"/>.
        /// </summary>
        public void TestEmptyString()
        {
            const byte separator = 0x20;

            IEnumerable <Input> inputs = AnalyzingSuggesterTest.Shuffle(
                new Input("foo bar bar bar bar", 50));
            Analyzer analyzer = new MockAnalyzer(Random);
            FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 2, separator);
            suggester.Build(new InputArrayIterator(inputs));

            try
            {
                suggester.DoLookup("", 10);
                fail("did not hit exception");
            }
            catch (ArgumentException)
            {
                // expected
            }
        }
コード例 #11
0
        /// <summary>
        /// Builds a suggester from a line-docs file and, when <c>VERBOSE</c> is
        /// set, prints its size in bytes and the results of looking up "general r".
        /// Makes no assertions.
        /// </summary>
        public void TestWiki()
        {
            LineFileDocs lineDocs = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);

            // Skip header:
            lineDocs.NextDoc();

            Analyzer analyzer = new MockAnalyzer(Random);
            FreeTextSuggester suggester = new FreeTextSuggester(analyzer);
            suggester.Build(new TestWikiInputIterator(this, lineDocs));

            // Everything below is diagnostic output only.
            if (!VERBOSE)
            {
                return;
            }

            Console.WriteLine(suggester.GetSizeInBytes() + " bytes");

            IList <Lookup.LookupResult> hits = suggester.DoLookup("general r", 10);
            Console.WriteLine("results:");
            for (int i = 0; i < hits.Count; i++)
            {
                Console.WriteLine("  " + hits[i]);
            }
        }
コード例 #12
0
 /// <summary>
 /// Creates the comparer and stores the enclosing suggester on this instance.
 /// </summary>
 /// <param name="outerInstance">The enclosing suggester, kept in the <c>outerInstance</c> field.</param>
 public ComparerAnonymousInnerClassHelper(FreeTextSuggester outerInstance)
 {
     this.outerInstance = outerInstance;
 }
コード例 #13
0
 /// <summary>
 /// Creates the wrapper, passing <paramref name="reuseStrategy"/> to the base
 /// constructor and storing the other two arguments on this instance.
 /// </summary>
 /// <param name="outerInstance">The enclosing suggester, kept in the <c>outerInstance</c> field.</param>
 /// <param name="reuseStrategy">Forwarded to the base constructor.</param>
 /// <param name="other">Analyzer stored in the <c>other</c> field.</param>
 public AnalyzerWrapperAnonymousInnerClassHelper(
     FreeTextSuggester outerInstance,
     ReuseStrategy reuseStrategy,
     Analyzer other)
     : base(reuseStrategy)
 {
     this.other         = other;
     this.outerInstance = outerInstance;
 }
コード例 #14
0
        /// <summary>
        /// Randomized check of <see cref="FreeTextSuggester"/>: builds a suggester
        /// over randomly generated documents, independently computes the expected
        /// suggestions from plain n-gram counts, and asserts that both agree for a
        /// series of random queries.
        /// </summary>
        /// <remarks>
        /// The expected and actual result lists are compared through the joined
        /// string form of their elements. Calling <c>ToString()</c> on the list
        /// objects themselves would, for a standard <c>List&lt;T&gt;</c>, compare
        /// only type names and let any mismatch pass unnoticed.
        /// </remarks>
        public void TestRandom()
        {
            string[]      terms = new string[TestUtil.NextInt32(Random, 2, 10)];
            ISet <string> seen  = new HashSet <string>();

            // Fill the vocabulary with distinct random tokens.
            while (seen.size() < terms.Length)
            {
                string token = TestUtil.RandomSimpleString(Random, 1, 5);
                if (!seen.contains(token))
                {
                    terms[seen.size()] = token;
                    seen.add(token);
                }
            }

            Analyzer a = new MockAnalyzer(Random);

            int  numDocs   = AtLeast(10);
            long totTokens = 0;

            // Generate the documents, token by token, and total up the token count.
            string[][] docs = new string[numDocs][];
            for (int i = 0; i < numDocs; i++)
            {
                docs[i] = new string[AtLeast(100)];
                if (VERBOSE)
                {
                    Console.Write("  doc " + i + ":");
                }
                for (int j = 0; j < docs[i].Length; j++)
                {
                    docs[i][j] = GetZipfToken(terms);
                    if (VERBOSE)
                    {
                        Console.Write(" " + docs[i][j]);
                    }
                }
                if (VERBOSE)
                {
                    Console.WriteLine();
                }
                totTokens += docs[i].Length;
            }

            int grams = TestUtil.NextInt32(Random, 1, 4);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
            }

            // Build suggester model:
            FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);

            sug.Build(new TestRandomInputIterator(this, docs));

            // Build inefficient but hopefully correct model:
            // gramCounts[g] maps every run of (g + 1) consecutive tokens to its count.
            List <IDictionary <string, int?> > gramCounts = new List <IDictionary <string, int?> >(grams);

            for (int gram = 0; gram < grams; gram++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: build model for gram=" + gram);
                }
                IDictionary <string, int?> model = new HashMap <string, int?>();
                gramCounts.Add(model);
                foreach (string[] doc in docs)
                {
                    for (int i = 0; i < doc.Length - gram; i++)
                    {
                        StringBuilder b = new StringBuilder();
                        for (int j = i; j <= i + gram; j++)
                        {
                            if (j > i)
                            {
                                b.append(' ');
                            }
                            b.append(doc[j]);
                        }
                        string token = b.toString();

                        // Single lookup; curCount stays null when the n-gram is new.
                        int? curCount;
                        model.TryGetValue(token, out curCount);
                        if (curCount == null)
                        {
                            model.Put(token, 1);
                        }
                        else
                        {
                            model.Put(token, 1 + curCount);
                        }
                        if (VERBOSE)
                        {
                            Console.WriteLine("  add '" + token + "' -> count=" + (model.ContainsKey(token) ? model[token].ToString() : ""));
                        }
                    }
                }
            }

            int lookups = AtLeast(100);

            for (int iter = 0; iter < lookups; iter++)
            {
                string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
                for (int i = 0; i < tokens.Length; i++)
                {
                    tokens[i] = GetZipfToken(terms);
                }

                // Maybe trim last token; be sure not to create the
                // empty string:
                int trimStart;
                if (tokens.Length == 1)
                {
                    trimStart = 1;
                }
                else
                {
                    trimStart = 0;
                }
                int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
                tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt);

                int           num = TestUtil.NextInt32(Random, 1, 100);
                StringBuilder b   = new StringBuilder();
                foreach (string token in tokens)
                {
                    b.append(' ');
                    b.append(token);
                }
                string query = b.toString();
                // Drop the leading space added before the first token.
                query = query.Substring(1);

                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
                }

                // Expected:
                List <Lookup.LookupResult> expected = new List <Lookup.LookupResult>();
                double backoff = 1.0;
                seen = new HashSet <string>();

                if (VERBOSE)
                {
                    Console.WriteLine("  compute expected");
                }

                // Walk from the highest-order model down to unigrams, scaling the
                // score by ALPHA each time we back off to a lower order.
                for (int i = grams - 1; i >= 0; i--)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("    grams=" + i);
                    }

                    if (tokens.Length < i + 1)
                    {
                        // Don't have enough tokens to use this model
                        if (VERBOSE)
                        {
                            Console.WriteLine("      skip");
                        }
                        continue;
                    }

                    if (i == 0 && tokens[tokens.Length - 1].Length == 0)
                    {
                        // Never suggest unigrams from empty string:
                        if (VERBOSE)
                        {
                            Console.WriteLine("      skip unigram priors only");
                        }
                        continue;
                    }

                    // Build up "context" ngram:
                    b = new StringBuilder();
                    for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
                    {
                        b.append(' ');
                        b.append(tokens[j]);
                    }
                    string context = b.toString();
                    if (context.Length > 0)
                    {
                        context = context.Substring(1);
                    }
                    if (VERBOSE)
                    {
                        Console.WriteLine("      context='" + context + "'");
                    }
                    long contextCount;
                    if (context.Length == 0)
                    {
                        contextCount = totTokens;
                    }
                    else
                    {
                        // The context is an i-token sequence, so its count lives in model i - 1.
                        int? count;
                        gramCounts[i - 1].TryGetValue(context, out count);
                        if (count == null)
                        {
                            // We never saw this context:
                            backoff *= FreeTextSuggester.ALPHA;
                            if (VERBOSE)
                            {
                                Console.WriteLine("      skip: never saw context");
                            }
                            continue;
                        }
                        contextCount = count.GetValueOrDefault();
                    }
                    if (VERBOSE)
                    {
                        Console.WriteLine("      contextCount=" + contextCount);
                    }
                    IDictionary <string, int?> model = gramCounts[i];

                    // First pass, gather all predictions for this model:
                    if (VERBOSE)
                    {
                        Console.WriteLine("      find terms w/ prefix=" + tokens[tokens.Length - 1]);
                    }
                    List <Lookup.LookupResult> tmp = new List <Lookup.LookupResult>();
                    foreach (string term in terms)
                    {
                        if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
                        {
                            if (VERBOSE)
                            {
                                Console.WriteLine("        term=" + term);
                            }
                            if (seen.contains(term))
                            {
                                if (VERBOSE)
                                {
                                    Console.WriteLine("          skip seen");
                                }
                                continue;
                            }
                            string ngram = (context + " " + term).Trim();
                            int? count;
                            model.TryGetValue(ngram, out count);
                            if (count != null)
                            {
                                // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                // This is also the way it is being done in the FreeTextSuggester to work around the issue.
                                Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
                                tmp.Add(lr);
                                if (VERBOSE)
                                {
                                    Console.WriteLine("      add tmp key='" + lr.Key + "' score=" + lr.Value);
                                }
                            }
                        }
                    }

                    // Second pass, trim to only top N, and fold those
                    // into overall suggestions:
                    tmp.Sort(byScoreThenKey);
                    if (tmp.size() > num)
                    {
                        tmp.RemoveRange(num, tmp.size() - num);
                    }
                    foreach (Lookup.LookupResult result in tmp)
                    {
                        string key = result.Key.toString();
                        int    idx = key.LastIndexOf(' ');
                        string lastToken;
                        if (idx != -1)
                        {
                            lastToken = key.Substring(idx + 1);
                        }
                        else
                        {
                            lastToken = key;
                        }
                        // Keep only the first suggestion seen for each final token.
                        if (!seen.contains(lastToken))
                        {
                            seen.add(lastToken);
                            expected.Add(result);
                            if (VERBOSE)
                            {
                                Console.WriteLine("      keep key='" + result.Key + "' score=" + result.Value);
                            }
                        }
                    }

                    backoff *= FreeTextSuggester.ALPHA;
                }

                expected.Sort(byScoreThenKey);

                if (expected.size() > num)
                {
                    expected.RemoveRange(num, expected.size() - num);
                }

                // Actual:
                IList <Lookup.LookupResult> actual = sug.DoLookup(query, num);

                // Render both lists element by element so that the comparison (and
                // the verbose dump) reflects the contents rather than the string
                // form of the list objects themselves.
                string expectedText = string.Join(", ", expected);
                string actualText   = string.Join(", ", actual);

                if (VERBOSE)
                {
                    Console.WriteLine("  expected: " + expectedText);
                    Console.WriteLine("    actual: " + actualText);
                }

                assertEquals(expectedText, actualText);
            }
        }