Example No. 1
        /// <summary>
        /// Returns <see cref="Search.CollectionStatistics"/> for a field.
        /// <para/>
        /// This can be overridden, for example, to return a field's statistics
        /// across a distributed collection.
        /// <para/>
        /// @lucene.experimental
        /// </summary>
        public virtual CollectionStatistics CollectionStatistics(string field)
        {
            int  docCount;
            long sumTotalTermFreq;
            long sumDocFreq;

            Debug.Assert(field != null);

            Terms terms = MultiFields.GetTerms(reader, field);

            if (terms == null)
            {
                docCount         = 0;
                sumTotalTermFreq = 0;
                sumDocFreq       = 0;
            }
            else
            {
                docCount         = terms.DocCount;
                sumTotalTermFreq = terms.SumTotalTermFreq;
                sumDocFreq       = terms.SumDocFreq;
            }
            return(new CollectionStatistics(field, reader.MaxDoc, docCount, sumTotalTermFreq, sumDocFreq));
        }
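As the summary above notes, this method can be overridden for a distributed collection, which typically means summing per-shard statistics. A minimal hedged sketch (the "shardSearchers" list is a hypothetical fixture, not part of the example above):

        // Hedged sketch: aggregate CollectionStatistics across shards.
        // "shardSearchers" is a hypothetical IList<IndexSearcher>, one per shard.
        // For brevity this ignores the -1 "statistic unavailable" convention.
        public override CollectionStatistics CollectionStatistics(string field)
        {
            long maxDoc = 0, docCount = 0, sumTotalTermFreq = 0, sumDocFreq = 0;
            foreach (IndexSearcher shard in shardSearchers)
            {
                CollectionStatistics stats = shard.CollectionStatistics(field);
                maxDoc           += stats.MaxDoc;
                docCount         += stats.DocCount;
                sumTotalTermFreq += stats.SumTotalTermFreq;
                sumDocFreq       += stats.SumDocFreq;
            }
            return new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
        }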
Example No. 2
        public void TestKeepsFirstFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);

            df.KeepMode = (KeepMode.KM_USE_FIRST_OCCURRENCE);
            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
            foreach (ScoreDoc hit in hits)
            {
                Document d   = searcher.Doc(hit.Doc);
                string   url = d.Get(KEY_FIELD);
                DocsEnum td  = TestUtil.Docs(Random, reader,
                                             KEY_FIELD,
                                             new BytesRef(url),
                                             MultiFields.GetLiveDocs(reader),
                                             null,
                                             0);

                td.NextDoc();
                int lastDoc = td.DocID;
                assertEquals("Duplicate urls should return first doc", lastDoc, hit.Doc);
            }
        }
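The complementary mode, KeepMode.KM_USE_LAST_OCCURRENCE, can be verified the same way by advancing the postings to the final matching document. A hedged sketch assuming the same searcher/reader/tq/KEY_FIELD fixture as above:

        public void TestKeepsLastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);

            df.KeepMode = KeepMode.KM_USE_LAST_OCCURRENCE;
            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
            foreach (ScoreDoc hit in hits)
            {
                Document d   = searcher.Doc(hit.Doc);
                string   url = d.Get(KEY_FIELD);
                DocsEnum td  = TestUtil.Docs(Random, reader,
                                             KEY_FIELD,
                                             new BytesRef(url),
                                             MultiFields.GetLiveDocs(reader),
                                             null,
                                             0);

                int lastDoc = 0;
                // walk to the last posting; last-occurrence mode should have kept that doc
                while (td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    lastDoc = td.DocID;
                }
                assertEquals("Duplicate urls should return last doc", lastDoc, hit.Doc);
            }
        }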
Example No. 3
        public void AddWithAnalyzerSuccess()
        {
            TestObject t = new TestObject()
            {
                Number = 9876,
                String = "Test Object 9876",
            };

            Assert.AreEqual(0, writer.NumDocs);
            writer.Add(t, new KeywordAnalyzer());
            writer.Commit();
            Assert.AreEqual(1, writer.NumDocs);

            DirectoryReader reader = DirectoryReader.Open(dir);
            int             nTerms = 0;
            Fields          fields = MultiFields.GetFields(reader);

            foreach (string field in fields)
            {
                Terms     tms       = fields.GetTerms(field);
                TermsEnum termsEnum = tms.GetIterator(null);
                BytesRef  text      = termsEnum.Next();
                while (text != null)
                {
                    if (String.Equals("String", field))
                    {
                        string str = text.Utf8ToString();
                        Assert.AreEqual("Test Object 9876", str);
                        nTerms++;
                    }
                    text = termsEnum.Next();
                }
            }

            Assert.AreEqual(1, nTerms);
            reader.Dispose();
        }
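The TestObject type and the writer/dir fixture are not part of this excerpt; a hedged reconstruction consistent with the assertions above (all names here are assumptions):

        public class TestObject
        {
            public int    Number { get; set; }
            public string String { get; set; }
        }

        // "writer" is assumed to wrap an IndexWriter over "dir", mapping the
        // TestObject properties to fields named "Number" and "String". With
        // KeywordAnalyzer the value "Test Object 9876" is indexed as a single
        // un-tokenized term, which is why the loop counts exactly one term.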
Example No. 4
        public void UpdateWithKindWithAnalyzerSuccess()
        {
            TestObject t = new TestObject()
            {
                Number = 1234,
                String = "Test Object 1234",
            };

            Assert.AreEqual(0, writer.NumDocs);
            writer.Add <object>(t, new KeywordAnalyzer());
            writer.Commit();
            Assert.AreEqual(1, writer.NumDocs);

            TestObject t2 = new TestObject()
            {
                Number = 2345,
                String = "Something Else 2345",
            };

            writer.Update(t2, DocumentObjectTypeKind.Static, new TermQuery(new Term("String", "Test Object 1234")), new KeywordAnalyzer());
            writer.Commit();
            Assert.AreEqual(2, writer.NumDocs);

            writer.DeleteDocuments <object>(new MatchAllDocsQuery());
            writer.Commit();
            Assert.AreEqual(1, writer.NumDocs);

            TestObject t3 = new TestObject()
            {
                Number = 3456,
                String = "Completely Different 3456",
            };

            writer.Update(t3, DocumentObjectTypeKind.Actual, new TermQuery(new Term("String", "Something Else 2345")), new KeywordAnalyzer());
            writer.Commit();
            Assert.AreEqual(1, writer.NumDocs);

            DirectoryReader reader = DirectoryReader.Open(dir);
            Fields          fields = MultiFields.GetFields(reader);
            int             nTerms = 0;

            foreach (string field in fields)
            {
                Terms     tms       = fields.GetTerms(field);
                TermsEnum termsEnum = tms.GetIterator(null);
                BytesRef  text      = termsEnum.Next();
                while (text != null)
                {
                    if (String.Equals("String", field))
                    {
                        if (String.Equals("Completely Different 3456", text.Utf8ToString()))
                        {
                            nTerms++;
                        }
                    }
                    text = termsEnum.Next();
                }
            }

            Assert.AreEqual(1, nTerms);
            reader.Dispose();
        }
Example No. 5
        /// <summary>
        /// Provide spelling corrections based on several parameters.
        /// </summary>
        /// <param name="term"> The term to suggest spelling corrections for </param>
        /// <param name="numSug"> The maximum number of spelling corrections </param>
        /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
        /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
        /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
        /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
        /// <param name="spare"> a chars scratch </param>
        /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
        /// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
        protected internal virtual IEnumerable <ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir,
                                                                          int docfreq, int editDistance, float accuracy, CharsRef spare)
        {
            var atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            Terms terms = MultiFields.GetTerms(ir, term.Field);

            if (terms == null)
            {
                return(new List <ScoreTerm>());
            }
            FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

            var stQueue = new Support.PriorityQueue <ScoreTerm>();

            BytesRef        queryTerm = new BytesRef(term.Text());
            BytesRef        candidateTerm;
            ScoreTerm       st       = new ScoreTerm();
            IBoostAttribute boostAtt = e.Attributes.AddAttribute <IBoostAttribute>();

            while ((candidateTerm = e.Next()) != null)
            {
                float boost = boostAtt.Boost;
                // ignore uncompetitive hits
                if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
                {
                    continue;
                }

                // ignore exact match of the same term
                if (queryTerm.BytesEquals(candidateTerm))
                {
                    continue;
                }

                int df = e.DocFreq;

                // check docFreq if required
                if (df <= docfreq)
                {
                    continue;
                }

                float  score;
                string termAsString;
                if (distance == INTERNAL_LEVENSHTEIN)
                {
                    // delay creating strings until the end
                    termAsString = null;
                    // undo FuzzyTermsEnum's scale factor for a real scaled lev score
                    score = boost / e.ScaleFactor + e.MinSimilarity;
                }
                else
                {
                    UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
                    termAsString = spare.ToString();
                    score        = distance.GetDistance(term.Text(), termAsString);
                }

                if (score < accuracy)
                {
                    continue;
                }

                // add new entry in PQ
                st.Term         = BytesRef.DeepCopyOf(candidateTerm);
                st.Boost        = boost;
                st.Docfreq      = df;
                st.TermAsString = termAsString;
                st.Score        = score;
                stQueue.Offer(st);
                // possibly drop entries from queue
                st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
                maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
            }

            return(stQueue);
        }
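SuggestSimilar is protected internal; the surrounding fields (minPrefix, distance, INTERNAL_LEVENSHTEIN) suggest this is DirectSpellChecker's implementation, which callers reach through its public overloads. A hedged usage sketch (field and term values are illustrative):

        DirectSpellChecker spellChecker = new DirectSpellChecker();
        SuggestWord[] suggestions = spellChecker.SuggestSimilar(
            new Term("body", "lucenne"),              // misspelled term
            5,                                        // max corrections to return
            reader,                                   // IndexReader supplying candidates
            SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
        foreach (SuggestWord sw in suggestions)
        {
            Console.WriteLine(sw.String + " score=" + sw.Score + " freq=" + sw.Freq);
        }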
Example No. 6
        // randomly seeks to a term that we know exists, then steps
        // through the following terms from there
        private void DoTestSeekExists(Random r, IList <Term> fieldTerms, IndexReader reader)
        {
            IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>();

            // Test random seek to existing term, then enum:
            if (Verbose)
            {
                Console.WriteLine("\nTEST: top now seek");
            }

            int num = AtLeast(100);

            for (int iter = 0; iter < num; iter++)
            {
                // pick random field+term
                int    spot  = r.Next(fieldTerms.Count);
                Term   term  = fieldTerms[spot];
                string field = term.Field;

                if (Verbose)
                {
                    Console.WriteLine("TEST: exist seek field=" + field + " term=" + UnicodeUtil.ToHexString(term.Text));
                }

                // seek to it
                if (!tes.TryGetValue(field, out TermsEnum te))
                {
                    te         = MultiFields.GetTerms(reader, field).GetEnumerator();
                    tes[field] = te;
                }

                if (Verbose)
                {
                    Console.WriteLine("  done get enum");
                }

                // seek should find the term
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(term.Bytes));

                // now .next() this many times:
                int ct = TestUtil.NextInt32(r, 5, 100);
                for (int i = 0; i < ct; i++)
                {
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: now next()");
                    }
                    if (1 + spot + i >= fieldTerms.Count)
                    {
                        break;
                    }
                    term = fieldTerms[1 + spot + i];
                    if (!term.Field.Equals(field, StringComparison.Ordinal))
                    {
                        Assert.IsFalse(te.MoveNext());
                        break;
                    }
                    else
                    {
                        Assert.IsTrue(te.MoveNext());
                        BytesRef t = te.Term;

                        if (Verbose)
                        {
                            Console.WriteLine("  got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString())));
                            Console.WriteLine("       exp=" + UnicodeUtil.ToHexString(term.Text.ToString()));
                        }

                        Assert.AreEqual(term.Bytes, t);
                    }
                }
            }
        }
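The FOUND assertion above is one of three possible SeekCeil outcomes; for reference, a brief sketch of all three:

            TermsEnum te = MultiFields.GetTerms(reader, "field").GetEnumerator();
            switch (te.SeekCeil(new BytesRef("target")))
            {
            case TermsEnum.SeekStatus.FOUND:
                // positioned exactly on "target"
                break;

            case TermsEnum.SeekStatus.NOT_FOUND:
                // positioned on the smallest term greater than "target"
                break;

            case TermsEnum.SeekStatus.END:
                // no term >= "target"; the enum is exhausted
                break;
            }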
Example No. 7
        /// <summary>
        /// A variant that uses pulsing, but with a high TF to force a pass-through to the underlying codec.
        /// </summary>
        public virtual void Test10kNotPulsed()
        {
            // we always run this test with pulsing codec.
            int   freqCutoff = TestUtil.NextInt32(Random, 1, 10);
            Codec cp         = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(freqCutoff));

            DirectoryInfo f = CreateTempDir("10knotpulsed");
            BaseDirectoryWrapper dir = NewFSDirectory(f);

            dir.CheckIndexOnClose = false;     // we do this ourselves explicitly
            RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp));

            Document  document = new Document();
            FieldType ft       = new FieldType(TextField.TYPE_STORED);

            switch (TestUtil.NextInt32(Random, 0, 2))
            {
            case 0:
                ft.IndexOptions = IndexOptions.DOCS_ONLY;
                break;

            case 1:
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                break;

            default:
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
                break;
            }

            Field field = NewField("field", "", ft);

            document.Add(field);

            // freq above the cutoff forces the postings out of the pulsed in-line encoding
            int freq = freqCutoff + 1;

            for (int i = 0; i < 10050; i++)
            {
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < freq; j++)
                {
                    // five-digit zero-padded term (replaces Java's DecimalFormat("00000"))
                    sb.Append(i.ToString("00000", CultureInfo.InvariantCulture));
                    sb.Append(' ');     // whitespace
                }
                field.SetStringValue(sb.ToString());
                iw.AddDocument(document);
            }

            IndexReader ir = iw.GetReader();

            iw.Dispose();

            TermsEnum te = MultiFields.GetTerms(ir, "field").GetIterator(null);
            DocsEnum  de = null;

            for (int i = 0; i < 10050; i++)
            {
                string expected = i.ToString("00000", CultureInfo.InvariantCulture);
                assertEquals(expected, te.Next().Utf8ToString());
                de = TestUtil.Docs(Random, te, null, de, 0);     // 0 == DocsEnum.FLAG_NONE
                assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc());
            }
            ir.Dispose();

            TestUtil.CheckIndex(dir);
            dir.Dispose();
        }
Example No. 8
        public virtual void Test()
        {
            Directory    dir      = NewDirectory();
            MockAnalyzer analyzer = new MockAnalyzer(Random);

            analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
            RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir, analyzer);
            LineFileDocs docs         = new LineFileDocs(Random, DefaultCodecSupportsDocValues);
            int          charsToIndex = AtLeast(100000);
            int          charsIndexed = 0;

            //System.out.println("bytesToIndex=" + charsToIndex);
            while (charsIndexed < charsToIndex)
            {
                Document doc = docs.NextDoc();
                charsIndexed += doc.Get("body").Length;
                w.AddDocument(doc);
                //System.out.println("  bytes=" + charsIndexed + " add: " + doc);
            }
            IndexReader r = w.GetReader();

            //System.out.println("numDocs=" + r.NumDocs);
            w.Dispose();

            IndexSearcher s         = NewSearcher(r);
            Terms         terms     = MultiFields.GetFields(r).GetTerms("body");
            int           termCount = 0;
            TermsEnum     termsEnum = terms.GetIterator(null);

            while (termsEnum.Next() != null)
            {
                termCount++;
            }
            Assert.IsTrue(termCount > 0);

            // Target ~10 terms to search:
            double chance = 10.0 / termCount;

            termsEnum = terms.GetIterator(termsEnum);
            IDictionary <BytesRef, TopDocs> answers = new Dictionary <BytesRef, TopDocs>();

            while (termsEnum.Next() != null)
            {
                if (Random.NextDouble() <= chance)
                {
                    BytesRef term = BytesRef.DeepCopyOf(termsEnum.Term);
                    answers[term] = s.Search(new TermQuery(new Term("body", term)), 100);
                }
            }

            if (answers.Count > 0)
            {
                CountdownEvent startingGun = new CountdownEvent(1);
                int            numThreads  = TestUtil.NextInt32(Random, 2, 5);
                ThreadJob[]    threads     = new ThreadJob[numThreads];
                for (int threadID = 0; threadID < numThreads; threadID++)
                {
                    ThreadJob thread = new ThreadAnonymousInnerClassHelper(this, s, answers, startingGun);
                    threads[threadID] = thread;
                    thread.Start();
                }
                startingGun.Signal();
                foreach (ThreadJob thread in threads)
                {
                    thread.Join();
                }
            }
            r.Dispose();
            dir.Dispose();
        }
Example No. 9
 public override void Run()
 {
     if (VERBOSE)
     {
         Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread");
     }
     while (Environment.TickCount < stopTimeMS)
     {
         try
         {
             IndexSearcher s = outerInstance.GetCurrentSearcher();
             try
             {
                 // Verify 1) IW is correctly setting
                 // diagnostics, and 2) segment warming for
                 // merged segments is actually happening:
                 foreach (AtomicReaderContext sub in s.IndexReader.Leaves)
                 {
                     SegmentReader segReader = (SegmentReader)sub.Reader;
                     IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics;
                     assertNotNull(diagnostics);
                     string source;
                     diagnostics.TryGetValue("source", out source);
                     assertNotNull(source);
                     if (source.Equals("merge", StringComparison.Ordinal))
                     {
                         assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo,
                                    !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.ContainsKey(segReader.core));
                     }
                 }
                 if (s.IndexReader.NumDocs > 0)
                 {
                     outerInstance.SmokeTestSearcher(s);
                     Fields fields = MultiFields.GetFields(s.IndexReader);
                     if (fields == null)
                     {
                         continue;
                     }
                     Terms terms = fields.GetTerms("body");
                     if (terms == null)
                     {
                         continue;
                     }
                     TermsEnum termsEnum     = terms.GetIterator(null);
                     int       seenTermCount = 0;
                     int       shift;
                     int       trigger;
                     if (totTermCount.Get() < 30)
                     {
                         shift   = 0;
                         trigger = 1;
                     }
                     else
                     {
                         trigger = totTermCount.Get() / 30;
                         shift   = Random.Next(trigger);
                     }
                     while (Environment.TickCount < stopTimeMS)
                     {
                         BytesRef term = termsEnum.Next();
                         if (term == null)
                         {
                             totTermCount.Set(seenTermCount);
                             break;
                         }
                         seenTermCount++;
                         // search 30 terms
                         if ((seenTermCount + shift) % trigger == 0)
                         {
                             //if (VERBOSE) {
                             //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString());
                             //}
                             totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", term))));
                         }
                     }
                     //if (VERBOSE) {
                     //System.out.println(Thread.currentThread().getName() + ": search done");
                     //}
                 }
             }
             finally
             {
                 outerInstance.ReleaseSearcher(s);
             }
         }
         catch (Exception t)
         {
             Console.WriteLine(Thread.CurrentThread.Name + ": hit exc");
             outerInstance.m_failed.Set(true);
             Console.WriteLine(t.ToString());
             throw new Exception(t.ToString(), t);
         }
     }
 }
Example No. 10
        /// <summary>
        /// Build the suggest index, using up to the specified
        ///  amount of temporary RAM while building.  Note that
        ///  the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
        {
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();
            // TODO: messy ... java7 has Files.createTempDirectory
            // ... but 4.x is java6:
            DirectoryInfo tempIndexPath = null;
            Random        random        = new Random();

            while (true)
            {
                tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue)));
                tempIndexPath.Create();
                if (System.IO.Directory.Exists(tempIndexPath.FullName))
                {
                    break;
                }
            }

            using (Directory dir = FSDirectory.Open(tempIndexPath))
            {
#pragma warning disable 612, 618
                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
                iwc.SetOpenMode(OpenMode.CREATE);
                iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
                IndexWriter writer = new IndexWriter(dir, iwc);

                var ft = new FieldType(TextField.TYPE_NOT_STORED);
                // TODO: if only we had IndexOptions.TERMS_ONLY...
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                ft.OmitNorms    = true;
                ft.Freeze();

                Document doc   = new Document();
                Field    field = new Field("body", "", ft);
                doc.Add(field);

                totTokens = 0;
                IndexReader reader = null;

                bool success = false;
                count = 0;
                try
                {
                    while (true)
                    {
                        BytesRef surfaceForm = iterator.Next();
                        if (surfaceForm == null)
                        {
                            break;
                        }
                        field.SetStringValue(surfaceForm.Utf8ToString());
                        writer.AddDocument(doc);
                        count++;
                    }
                    reader = DirectoryReader.Open(writer, false);

                    Terms terms = MultiFields.GetTerms(reader, "body");
                    if (terms == null)
                    {
                        throw new System.ArgumentException("need at least one suggestion");
                    }

                    // Move all ngrams into an FST:
                    TermsEnum termsEnum = terms.GetIterator(null);

                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                    Int32sRef scratchInts = new Int32sRef();
                    while (true)
                    {
                        BytesRef term = termsEnum.Next();
                        if (term == null)
                        {
                            break;
                        }
                        int ngramCount = CountGrams(term);
                        if (ngramCount > grams)
                        {
                            throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                        }
                        if (ngramCount == 1)
                        {
                            totTokens += termsEnum.TotalTermFreq;
                        }

                        builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
                    }

                    fst = builder.Finish();
                    if (fst == null)
                    {
                        throw new System.ArgumentException("need at least one suggestion");
                    }
                    //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                    /*
                     * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                     * Util.toDot(fst, pw, true, true);
                     * pw.close();
                     */

                    success = true;
                }
                finally
                {
                    try
                    {
                        if (success)
                        {
                            IOUtils.Dispose(writer, reader);
                        }
                        else
                        {
                            IOUtils.DisposeWhileHandlingException(writer, reader);
                        }
                    }
                    finally
                    {
                        foreach (string file in dir.ListAll())
                        {
                            FileInfo path = new FileInfo(Path.Combine(tempIndexPath.FullName, file));
                            try
                            {
                                path.Delete();
                            }
                            catch (Exception e)
                            {
                                throw new InvalidOperationException("failed to remove " + path, e);
                            }
                        }

                        try
                        {
                            tempIndexPath.Delete();
                        }
                        catch (Exception e)
                        {
                            throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                        }
                    }
                }
            }
        }
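This Build method matches FreeTextSuggester's ngram-FST builder. A hedged usage sketch (the suggester type and the LuceneDictionary source are assumptions from that reading; any IInputIterator without payloads or contexts works):

        var suggester  = new FreeTextSuggester(new StandardAnalyzer(LuceneVersion.LUCENE_48));
        var dictionary = new LuceneDictionary(reader, "body");   // feed terms of an existing field
        suggester.Build(dictionary.GetEntryIterator(), 64.0);    // up to 64 MB of temporary RAM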
Example No. 11
        /// <summary>
        /// Make sure we skip wicked long terms.
        /// </summary>
        public virtual void TestWickedLongTerm()
        {
            RAMDirectory dir    = new RAMDirectory();
            IndexWriter  writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));

            char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
            Arrays.Fill(chars, 'x');
            Document doc = new Document();
            string bigTerm = new string(chars);

            // This produces a too-long term:
            string contents = "abc xyz x" + bigTerm + " another term";

            doc.Add(new TextField("content", contents, Field.Store.NO));
            writer.AddDocument(doc);

            // Make sure we can add another normal document
            doc = new Document();
            doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO));
            writer.AddDocument(doc);
            writer.Dispose();

            IndexReader reader = DirectoryReader.Open(dir);

            // Make sure all terms < max size were indexed
            assertEquals(2, reader.DocFreq(new Term("content", "abc")));
            assertEquals(1, reader.DocFreq(new Term("content", "bbb")));
            assertEquals(1, reader.DocFreq(new Term("content", "term")));
            assertEquals(1, reader.DocFreq(new Term("content", "another")));

            // Make sure position is still incremented when
            // massive term is skipped:
            DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another"));

            assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(1, tps.Freq);
            assertEquals(3, tps.NextPosition());

            // Make sure the doc that has the massive term is in
            // the index:
            assertEquals("document with wicked long term is not in the index!", 2, reader.NumDocs);

            reader.Dispose();

            // Make sure we can add a document with exactly the
            // maximum length term, and search on that term:
            doc = new Document();
            doc.Add(new TextField("content", bigTerm, Field.Store.NO));
            ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);

            sa.MaxTokenLength = 100000;
            writer            = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
            writer.AddDocument(doc);
            writer.Dispose();
            reader = DirectoryReader.Open(dir);
            assertEquals(1, reader.DocFreq(new Term("content", bigTerm)));
            reader.Dispose();

            dir.Dispose();
        }
Example No. 12
        public virtual void TestCaching()
        {
            Directory         dir    = new RAMDirectory();
            RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir);
            Document    doc    = new Document();
            TokenStream stream = new TokenStreamAnonymousInnerClassHelper(this);

            stream = new CachingTokenFilter(stream);

            doc.Add(new TextField("preanalyzed", stream));

            // 1) we consume all tokens twice before we add the doc to the index
            CheckTokens(stream);
            stream.Reset();
            CheckTokens(stream);

            // 2) now add the document to the index and verify if all tokens are indexed
            //    don't reset the stream here, the DocumentWriter should do that implicitly
            writer.AddDocument(doc);

            IndexReader          reader        = writer.GetReader();
            DocsAndPositionsEnum termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term1"));

            Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(1, termPositions.Freq);
            Assert.AreEqual(0, termPositions.NextPosition());

            termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term2"));
            Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(2, termPositions.Freq);
            Assert.AreEqual(1, termPositions.NextPosition());
            Assert.AreEqual(3, termPositions.NextPosition());

            termPositions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "preanalyzed", new BytesRef("term3"));
            Assert.IsTrue(termPositions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(1, termPositions.Freq);
            Assert.AreEqual(2, termPositions.NextPosition());
            reader.Dispose();
            writer.Dispose();
            // 3) reset stream and consume tokens again
            stream.Reset();
            CheckTokens(stream);
            dir.Dispose();
        }
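The anonymous TokenStream is not shown, but the assertions above pin it down: term1 at position 0, term2 at positions 1 and 3, term3 at position 2. A hedged stand-in producing the same stream, with MockTokenizer in place of the anonymous class:

            TokenStream stream = new MockTokenizer(
                new StringReader("term1 term2 term3 term2"),
                MockTokenizer.WHITESPACE, false);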
Example No. 13
 /// <summary>
 /// Creates an enumerator over the term, weight, and payload fields from the Lucene
 /// index. Setting <paramref name="hasPayloads"/> to <c>false</c> implies an enumerator
 /// over only term and weight.
 /// </summary>
 public DocumentInputEnumerator(DocumentDictionary documentDictionary, bool hasPayloads, bool hasContexts)
 {
     this.outerInstance = documentDictionary;
     this.hasPayloads   = hasPayloads;
     this.hasContexts   = hasContexts;
     docCount           = documentDictionary.m_reader.MaxDoc - 1;
     weightValues       = (documentDictionary.weightField != null) ? MultiDocValues.GetNumericValues(documentDictionary.m_reader, documentDictionary.weightField) : null;
     liveDocs           = (documentDictionary.m_reader.Leaves.Count > 0) ? MultiFields.GetLiveDocs(documentDictionary.m_reader) : null;
     relevantFields     = GetRelevantFields(new string[] { documentDictionary.field, documentDictionary.weightField, documentDictionary.m_payloadField, documentDictionary.m_contextsField });
 }
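The liveDocs bits captured here are consumed the usual way when walking documents (compare the warm loop in Example No. 14 below); a brief sketch:

     IBits liveDocs = MultiFields.GetLiveDocs(reader);   // null when the index has no deletions
     for (int docId = 0; docId < reader.MaxDoc; docId++)
     {
         if (liveDocs != null && !liveDocs.Get(docId))
         {
             continue;   // deleted document
         }
         Document d = reader.Document(docId);
         // ... read fields from d ...
     }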
Example No. 14
        public override int DoLogic()
        {
            int res = 0;

            // open reader or use existing one
            IndexSearcher searcher = RunData.GetIndexSearcher();

            IndexReader reader;

            bool closeSearcher;

            if (searcher == null)
            {
                // open our own reader
                Directory dir = RunData.Directory;
                reader        = DirectoryReader.Open(dir);
                searcher      = new IndexSearcher(reader);
                closeSearcher = true;
            }
            else
            {
                // use existing one; this passes +1 ref to us
                reader        = searcher.IndexReader;
                closeSearcher = false;
            }

            // optionally warm and add num docs traversed to count
            if (WithWarm)
            {
                Document doc      = null;
                IBits    liveDocs = MultiFields.GetLiveDocs(reader);
                for (int m = 0; m < reader.MaxDoc; m++)
                {
                    if (null == liveDocs || liveDocs.Get(m))
                    {
                        doc  = reader.Document(m);
                        res += (doc == null ? 0 : 1);
                    }
                }
            }

            if (WithSearch)
            {
                res++;
                Query   q       = queryMaker.MakeQuery();
                Sort    sort    = Sort;
                TopDocs hits    = null;
                int     numHits = NumHits;
                if (numHits > 0)
                {
                    if (WithCollector == false)
                    {
                        if (sort != null)
                        {
                            // TODO: instead of always passing false we
                            // should detect based on the query; if we make
                            // the IndexSearcher search methods that take
                            // Weight public again, we can go back to
                            // pulling the Weight ourselves:
                            TopFieldCollector collector = TopFieldCollector.Create(sort, numHits,
                                                                                   true, WithScore,
                                                                                   WithMaxScore,
                                                                                   false);
                            searcher.Search(q, null, collector);
                            hits = collector.GetTopDocs();
                        }
                        else
                        {
                            hits = searcher.Search(q, numHits);
                        }
                    }
                    else
                    {
                        ICollector collector = CreateCollector();
                        searcher.Search(q, null, collector);
                        //hits = collector.topDocs();
                    }

                    string printHitsField = RunData.Config.Get("print.hits.field", null);
                    if (hits != null && printHitsField != null && printHitsField.Length > 0)
                    {
                        Console.WriteLine("totalHits = " + hits.TotalHits);
                        Console.WriteLine("maxDoc()  = " + reader.MaxDoc);
                        Console.WriteLine("numDocs() = " + reader.NumDocs);
                        for (int i = 0; i < hits.ScoreDocs.Length; i++)
                        {
                            int      docID = hits.ScoreDocs[i].Doc;
                            Document doc   = reader.Document(docID);
                            Console.WriteLine("  " + i + ": doc=" + docID + " score=" + hits.ScoreDocs[i].Score + " " + printHitsField + " =" + doc.Get(printHitsField));
                        }
                    }

                    if (WithTraverse)
                    {
                        ScoreDoc[] scoreDocs     = hits.ScoreDocs;
                        int        traversalSize = Math.Min(scoreDocs.Length, TraversalSize);

                        if (traversalSize > 0)
                        {
                            bool                 retrieve     = WithRetrieve;
                            int                  numHighlight = Math.Min(NumToHighlight, scoreDocs.Length);
                            Analyzer             analyzer     = RunData.Analyzer;
                            BenchmarkHighlighter highlighter  = null;
                            if (numHighlight > 0)
                            {
                                highlighter = GetBenchmarkHighlighter(q);
                            }
                            for (int m = 0; m < traversalSize; m++)
                            {
                                int id = scoreDocs[m].Doc;
                                res++;
                                if (retrieve)
                                {
                                    Document document = RetrieveDoc(reader, id);
                                    res += document != null ? 1 : 0;
                                    if (numHighlight > 0 && m < numHighlight)
                                    {
                                        ICollection <string> fieldsToHighlight = GetFieldsToHighlight(document);
                                        foreach (string field in fieldsToHighlight)
                                        {
                                            string text = document.Get(field);
                                            res += highlighter.DoHighlight(reader, id, field, document, analyzer, text);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            if (closeSearcher)
            {
                reader.Dispose();
            }
            else
            {
                // Release our +1 ref from above
                reader.DecRef();
            }
            return(res);
        }
Example No. 15
        public virtual void TestMutipleDocument()
        {
            RAMDirectory dir    = new RAMDirectory();
            IndexWriter  writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer()));
            Document     doc    = new Document();

            doc.Add(new TextField("partnum", "Q36", Field.Store.YES));
            writer.AddDocument(doc);
            doc = new Document();
            doc.Add(new TextField("partnum", "Q37", Field.Store.YES));
            writer.AddDocument(doc);
            writer.Dispose();

            IndexReader reader = DirectoryReader.Open(dir);
            DocsEnum    td     = TestUtil.Docs(Random(), reader, "partnum", new BytesRef("Q36"), MultiFields.GetLiveDocs(reader), null, 0);

            assertTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            td = TestUtil.Docs(Random(), reader, "partnum", new BytesRef("Q37"), MultiFields.GetLiveDocs(reader), null, 0);
            assertTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            reader.Dispose();
            dir.Dispose();
        }
Example No. 16
        /// <summary>
        /// Build the suggest index, using up to the specified
        /// amount of temporary RAM while building.  Note that
        /// the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB)
        {
            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();

            // LUCENENET specific - using GetRandomFileName() instead of picking a random int
            DirectoryInfo tempIndexPath = null;

            while (true)
            {
                tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
                tempIndexPath.Create();
                if (System.IO.Directory.Exists(tempIndexPath.FullName))
                {
                    break;
                }
            }

            Directory dir = FSDirectory.Open(tempIndexPath);

            try
            {
#pragma warning disable 612, 618
                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
                iwc.SetOpenMode(OpenMode.CREATE);
                iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
                IndexWriter writer = new IndexWriter(dir, iwc);

                var ft = new FieldType(TextField.TYPE_NOT_STORED);
                // TODO: if only we had IndexOptions.TERMS_ONLY...
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                ft.OmitNorms    = true;
                ft.Freeze();

                Document doc   = new Document();
                Field    field = new Field("body", "", ft);
                doc.Add(field);

                totTokens = 0;
                IndexReader reader = null;

                bool success = false;
                count = 0;
                try
                {
                    while (enumerator.MoveNext())
                    {
                        BytesRef surfaceForm = enumerator.Current;
                        field.SetStringValue(surfaceForm.Utf8ToString());
                        writer.AddDocument(doc);
                        count++;
                    }

                    reader = DirectoryReader.Open(writer, false);

                    Terms terms = MultiFields.GetTerms(reader, "body");
                    if (terms == null)
                    {
                        throw new ArgumentException("need at least one suggestion");
                    }

                    // Move all ngrams into an FST:
                    TermsEnum termsEnum = terms.GetEnumerator(null);

                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                    Int32sRef scratchInts = new Int32sRef();
                    while (termsEnum.MoveNext())
                    {
                        BytesRef term       = termsEnum.Term;
                        int      ngramCount = CountGrams(term);
                        if (ngramCount > grams)
                        {
                            throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                        }
                        if (ngramCount == 1)
                        {
                            totTokens += termsEnum.TotalTermFreq;
                        }

                        builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
                    }

                    fst = builder.Finish();
                    if (fst == null)
                    {
                        throw new ArgumentException("need at least one suggestion");
                    }
                    //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                    /*
                     * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                     * Util.toDot(fst, pw, true, true);
                     * pw.close();
                     */

                    success = true;
                }
                finally
                {
                    if (success)
                    {
                        IOUtils.Dispose(writer, reader);
                    }
                    else
                    {
                        IOUtils.DisposeWhileHandlingException(writer, reader);
                    }
                }
            }
            finally
            {
                try
                {
                    IOUtils.Dispose(dir);
                }
                finally
                {
                    // LUCENENET specific - since we are removing the entire directory anyway,
                    // it doesn't make sense to first do a loop in order remove the files.
                    // Let the System.IO.Directory.Delete() method handle that.
                    // We also need to dispose the Directory instance first before deleting from disk.
                    try
                    {
                        System.IO.Directory.Delete(tempIndexPath.FullName, true);
                    }
                    catch (Exception e)
                    {
                        throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                    }
                }
            }
        }
Example No. 17
 /// <summary>
 /// Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all
 /// indexed fields from terms with a document frequency greater than the given
 /// maxDocFreq
 /// </summary>
 /// <param name="matchVersion"> Version to be used in <seealso cref="StopFilter"/> </param>
 /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
 /// <param name="indexReader"> IndexReader to identify the stopwords from </param>
 /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
 /// <exception cref="IOException"> Can be thrown while reading from the IndexReader </exception>
 public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer @delegate, IndexReader indexReader, int maxDocFreq)
     : this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxDocFreq)
 {
 }
Example No. 18
        public virtual void TestSetPosition()
        {
            Analyzer          analyzer = new AnalyzerAnonymousClass(this);
            Directory         store    = NewDirectory();
            RandomIndexWriter writer   = new RandomIndexWriter(Random, store, analyzer);
            Document          d        = new Document();

            d.Add(NewTextField("field", "bogus", Field.Store.YES));
            writer.AddDocument(d);
            IndexReader reader = writer.GetReader();

            writer.Dispose();

            IndexSearcher searcher = NewSearcher(reader);

            DocsAndPositionsEnum pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader, MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("1"));

            pos.NextDoc();
            // first token should be at position 0
            Assert.AreEqual(0, pos.NextPosition());

            pos = MultiFields.GetTermPositionsEnum(searcher.IndexReader, MultiFields.GetLiveDocs(searcher.IndexReader), "field", new BytesRef("2"));
            pos.NextDoc();
            // second token should be at position 2
            Assert.AreEqual(2, pos.NextPosition());

            PhraseQuery q;

            ScoreDoc[] hits;

            q = new PhraseQuery();
            q.Add(new Term("field", "1"));
            q.Add(new Term("field", "2"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // same as previous, just specify positions explicitly.
            q = new PhraseQuery();
            q.Add(new Term("field", "1"), 0);
            q.Add(new Term("field", "2"), 1);
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // specifying correct positions should find the phrase.
            q = new PhraseQuery();
            q.Add(new Term("field", "1"), 0);
            q.Add(new Term("field", "2"), 2);
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            q = new PhraseQuery();
            q.Add(new Term("field", "2"));
            q.Add(new Term("field", "3"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            q = new PhraseQuery();
            q.Add(new Term("field", "3"));
            q.Add(new Term("field", "4"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // phrase query would find it when correct positions are specified.
            q = new PhraseQuery();
            q.Add(new Term("field", "3"), 0);
            q.Add(new Term("field", "4"), 0);
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            // phrase query should fail for a non-existent searched term
            // even if another searched term exists at the same position.
            q = new PhraseQuery();
            q.Add(new Term("field", "3"), 0);
            q.Add(new Term("field", "9"), 0);
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // multi-phrase query should succeed for a non-existent searched term
            // because another searched term exists at the same position.
            MultiPhraseQuery mq = new MultiPhraseQuery();

            mq.Add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
            hits = searcher.Search(mq, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            q = new PhraseQuery();
            q.Add(new Term("field", "2"));
            q.Add(new Term("field", "4"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            q = new PhraseQuery();
            q.Add(new Term("field", "3"));
            q.Add(new Term("field", "5"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            q = new PhraseQuery();
            q.Add(new Term("field", "4"));
            q.Add(new Term("field", "5"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            q = new PhraseQuery();
            q.Add(new Term("field", "2"));
            q.Add(new Term("field", "5"));
            hits = searcher.Search(q, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            reader.Dispose();
            store.Dispose();
        }
Example No. 19
        public virtual void TestWickedLongTerm()
        {
            using (RAMDirectory dir = new RAMDirectory())
            {
                char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
                Arrays.Fill(chars, 'x');

                string   bigTerm = new string(chars);
                Document doc     = new Document();

                using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT))))
                {
                    // This produces a too-long term:
                    string contents = "abc xyz x" + bigTerm + " another term";
                    doc.Add(new TextField("content", contents, Field.Store.NO));
                    writer.AddDocument(doc);

                    // Make sure we can add another normal document
                    doc = new Document();
                    doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO));
                    writer.AddDocument(doc);
                }
#pragma warning disable 612, 618
                using (IndexReader reader = IndexReader.Open(dir))
#pragma warning restore 612, 618
                {
                    // Make sure all terms < max size were indexed
                    assertEquals(2, reader.DocFreq(new Term("content", "abc")));
                    assertEquals(1, reader.DocFreq(new Term("content", "bbb")));
                    assertEquals(1, reader.DocFreq(new Term("content", "term")));
                    assertEquals(1, reader.DocFreq(new Term("content", "another")));

                    // Make sure position is still incremented when
                    // massive term is skipped:
                    DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another"));
                    assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                    assertEquals(1, tps.Freq);
                    assertEquals(3, tps.NextPosition());

                    // Make sure the doc that has the massive term is in
                    // the index:
                    assertEquals("document with wicked long term should is not in the index!", 2, reader.NumDocs);
                }

                // Make sure we can add a document with exactly the
                // maximum length term, and search on that term:
                doc = new Document();
                doc.Add(new TextField("content", bigTerm, Field.Store.NO));
                ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
                sa.MaxTokenLength = 100000;
                using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa)))
                {
                    writer.AddDocument(doc);
                }
#pragma warning disable 612, 618
                using (var reader = IndexReader.Open(dir))
#pragma warning restore 612, 618
                {
                    assertEquals(1, reader.DocFreq(new Term("content", bigTerm)));
                }
            }
        }
Exemplo n.º 20
0
 /// <summary>
 /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for all
 /// indexed fields from terms with a document frequency percentage greater than
 /// the given <paramref name="maxPercentDocs"/>
 /// </summary>
 /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
 /// <param name="delegate"> <see cref="Analyzer"/> whose <see cref="TokenStream"/> will be filtered </param>
 /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
 /// <param name="maxPercentDocs"> The maximum percentage (between 0.0 and 1.0) of index documents which
 ///                      contain a term, after which the word is considered to be a stop word </param>
 /// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
 public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, float maxPercentDocs)
     : this(matchVersion, @delegate, indexReader, MultiFields.GetIndexedFields(indexReader), maxPercentDocs)
 {
 }
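
A minimal usage sketch for this constructor, assuming an existing index in a Directory named dir and an illustrative 0.4f threshold (terms occurring in more than 40% of documents become query-time stop words):

     using (IndexReader reader = DirectoryReader.Open(dir))
     {
         Analyzer stopWordAware = new QueryAutoStopWordAnalyzer(
             LuceneVersion.LUCENE_48,                       // matchVersion
             new StandardAnalyzer(LuceneVersion.LUCENE_48), // the wrapped analyzer
             reader,
             0.4f);                                         // maxPercentDocs
         // Hand stopWordAware to a QueryParser so over-frequent terms are
         // filtered out of user queries at parse time.
     }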
Exemplo n.º 21
0
        public override int GetOrdinal(FacetLabel cp)
        {
            EnsureOpen();
            if (cp.Length == 0)
            {
                return(ROOT_ORDINAL);
            }

            // First try to find the answer in the LRU cache:

            // LUCENENET: Despite LRUHashMap being thread-safe, we get much better performance
            // if reads are separated from writes.
            ordinalCacheLock.EnterReadLock();
            try
            {
                if (ordinalCache.TryGetValue(cp, out Int32Class res))
                {
                    if (res < indexReader.MaxDoc)
                    {
                        // Since the cache is shared with DTR instances allocated from
                        // doOpenIfChanged, we need to ensure that the ordinal is one that
                        // this DTR instance recognizes.
                        return(res);
                    }
                    else
                    {
                        // if we get here, it means that the category was found in the cache,
                        // but is not recognized by this TR instance. Therefore there's no
                        // need to continue searching for the path on disk, because we won't
                        // find it there either.
                        return(TaxonomyReader.INVALID_ORDINAL);
                    }
                }
            }
            finally
            {
                ordinalCacheLock.ExitReadLock();
            }

            // If we're still here, we have a cache miss. We need to fetch the
            // value from disk, and then also put it in the cache:
            int      ret  = TaxonomyReader.INVALID_ORDINAL;
            DocsEnum docs = MultiFields.GetTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(FacetsConfig.PathToString(cp.Components, cp.Length)), 0);

            if (docs != null && docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                ret = docs.DocID;

                // we only store the fact that a category exists, not its nonexistence.
                // This is required because the caches are shared with new DTR instances
                // that are allocated from doOpenIfChanged. Therefore, if we only store
                // information about found categories, we cannot accidentally tell a new
                // generation of DTR that a category does not exist.

                ordinalCacheLock.EnterWriteLock();
                try
                {
                    ordinalCache[cp] = ret;
                }
                finally
                {
                    ordinalCacheLock.ExitWriteLock();
                }
            }

            return(ret);
        }
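
The LUCENENET note above (separating reads from writes) is a general caching pattern. Here is a standalone sketch of the same idea with ReaderWriterLockSlim; it is not taken from this class and the names are illustrative:

        private readonly ReaderWriterLockSlim cacheLock = new ReaderWriterLockSlim();
        private readonly Dictionary<string, int> cache = new Dictionary<string, int>();

        public int GetOrAdd(string key, Func<int> compute)
        {
            cacheLock.EnterReadLock();
            try
            {
                if (cache.TryGetValue(key, out int cached))
                {
                    return cached; // fast path: readers never block each other
                }
            }
            finally
            {
                cacheLock.ExitReadLock();
            }

            int value = compute(); // compute outside any lock
            cacheLock.EnterWriteLock();
            try
            {
                cache[key] = value; // publish under a brief exclusive section
            }
            finally
            {
                cacheLock.ExitWriteLock();
            }
            return value;
        }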
Exemplo n.º 22
0
        public virtual void TestSimple()
        {
            int numNodes = TestUtil.NextInt(Random(), 1, 10);

            double runTimeSec = AtLeast(3);

            int minDocsToMakeTerms = TestUtil.NextInt(Random(), 5, 20);

            int maxSearcherAgeSeconds = TestUtil.NextInt(Random(), 1, 3);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
            }

            Start(numNodes, runTimeSec, maxSearcherAgeSeconds);

            List <PreviousSearchState> priorSearches = new List <PreviousSearchState>();
            List <BytesRef>            terms         = null;

            while (Time.NanoTime() < endTimeNanos)
            {
                bool doFollowon = priorSearches.Count > 0 && Random().Next(7) == 1;

                // Pick a random node; we will run the query on this node:
                int myNodeID = Random().Next(numNodes);

                NodeState.ShardIndexSearcher localShardSearcher;

                PreviousSearchState prevSearchState;

                if (doFollowon)
                {
                    // Pretend user issued a followon query:
                    prevSearchState = priorSearches[Random().Next(priorSearches.Count)];

                    if (VERBOSE)
                    {
                        Console.WriteLine("\nTEST: follow-on query age=" + ((Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0));
                    }

                    try
                    {
                        localShardSearcher = Nodes[myNodeID].Acquire(prevSearchState.Versions);
                    }
                    catch (SearcherExpiredException see)
                    {
                        // Expected, sometimes; in a "real" app we would
                        // either forward this error to the user ("too
                        // much time has passed; please re-run your
                        // search") or sneakily just switch to newest
                        // searcher w/o telling them...
                        if (VERBOSE)
                        {
                            Console.WriteLine("  searcher expired during local shard searcher init: " + see);
                        }
                        priorSearches.Remove(prevSearchState);
                        continue;
                    }
                }
                else
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine("\nTEST: fresh query");
                    }
                    // Do fresh query:
                    localShardSearcher = Nodes[myNodeID].Acquire();
                    prevSearchState    = null;
                }

                IndexReader[] subs = new IndexReader[numNodes];

                PreviousSearchState searchState = null;

                try
                {
                    // Mock: now make a single reader (MultiReader) from all node
                    // searchers.  In a real shard env you can't do this... we
                    // do it to confirm results from the shard searcher
                    // are correct:
                    int docCount = 0;
                    try
                    {
                        for (int nodeID = 0; nodeID < numNodes; nodeID++)
                        {
                            long          subVersion = localShardSearcher.NodeVersions[nodeID];
                            IndexSearcher sub        = Nodes[nodeID].Searchers.Acquire(subVersion);
                            if (sub == null)
                            {
                                int expiredNodeID = nodeID; // keep the failing node's ID; the loop below decrements past it
                                nodeID--;
                                while (nodeID >= 0)
                                {
                                    subs[nodeID].DecRef();
                                    subs[nodeID] = null;
                                    nodeID--;
                                }
                                throw new SearcherExpiredException("nodeID=" + expiredNodeID + " version=" + subVersion);
                            }
                            subs[nodeID] = sub.IndexReader;
                            docCount    += subs[nodeID].MaxDoc;
                        }
                    }
                    catch (SearcherExpiredException see)
                    {
                        // Expected
                        if (VERBOSE)
                        {
                            Console.WriteLine("  searcher expired during mock reader init: " + see);
                        }
                        continue;
                    }

                    IndexReader   mockReader   = new MultiReader(subs);
                    IndexSearcher mockSearcher = new IndexSearcher(mockReader);

                    Query query;
                    Sort  sort;

                    if (prevSearchState != null)
                    {
                        query = prevSearchState.Query;
                        sort  = prevSearchState.Sort;
                    }
                    else
                    {
                        if (terms == null && docCount > minDocsToMakeTerms)
                        {
                            // TODO: try to "focus" on high freq terms sometimes too
                            // TODO: maybe also periodically reset the terms...?
                            TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetIterator(null);
                            terms = new List <BytesRef>();
                            while (termsEnum.Next() != null)
                            {
                                terms.Add(BytesRef.DeepCopyOf(termsEnum.Term));
                            }
                            if (VERBOSE)
                            {
                                Console.WriteLine("TEST: init terms: " + terms.Count + " terms");
                            }
                            if (terms.Count == 0)
                            {
                                terms = null;
                            }
                        }

                        if (VERBOSE)
                        {
                            Console.WriteLine("  maxDoc=" + mockReader.MaxDoc);
                        }

                        if (terms != null)
                        {
                            if (Random().NextBoolean())
                            {
                                query = new TermQuery(new Term("body", terms[Random().Next(terms.Count)]));
                            }
                            else
                            {
                                string t = terms[Random().Next(terms.Count)].Utf8ToString();
                                string prefix;
                                if (t.Length <= 1)
                                {
                                    prefix = t;
                                }
                                else
                                {
                                    prefix = t.Substring(0, TestUtil.NextInt(Random(), 1, 2));
                                }
                                query = new PrefixQuery(new Term("body", prefix));
                            }

                            if (Random().NextBoolean())
                            {
                                sort = null;
                            }
                            else
                            {
                                // TODO: sort by more than 1 field
                                int what = Random().Next(3); // yields 0..2, so the final "title" sort branch below is never taken
                                if (what == 0)
                                {
                                    sort = new Sort(SortField.FIELD_SCORE);
                                }
                                else if (what == 1)
                                {
                                    // TODO: this sort doesn't merge
                                    // correctly... it's tricky because you
                                    // could have > 2.1B docs across all shards:
                                    //sort = new Sort(SortField.FIELD_DOC);
                                    sort = null;
                                }
                                else if (what == 2)
                                {
                                    sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random().NextBoolean()) });
                                }
                                else
                                {
                                    sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random().NextBoolean()) });
                                }
                            }
                        }
                        else
                        {
                            query = null;
                            sort  = null;
                        }
                    }

                    if (query != null)
                    {
                        try
                        {
                            searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
                        }
                        catch (SearcherExpiredException see)
                        {
                            // Expected; in a "real" app we would
                            // either forward this error to the user ("too
                            // much time has passed; please re-run your
                            // search") or sneakily just switch to newest
                            // searcher w/o telling them...
                            if (VERBOSE)
                            {
                                Console.WriteLine("  searcher expired during search: " + see);
                                Console.Out.Write(see.StackTrace);
                            }
                            // We can't do this in general: on a very slow
                            // computer it's possible the local searcher
                            // expires before we can finish our search:
                            // assert prevSearchState != null;
                            if (prevSearchState != null)
                            {
                                priorSearches.Remove(prevSearchState);
                            }
                        }
                    }
                }
                finally
                {
                    Nodes[myNodeID].Release(localShardSearcher);
                    foreach (IndexReader sub in subs)
                    {
                        if (sub != null)
                        {
                            sub.DecRef();
                        }
                    }
                }

                if (searchState != null && searchState.SearchAfterLocal != null && Random().Next(5) == 3)
                {
                    priorSearches.Add(searchState);
                    if (priorSearches.Count > 200)
                    {
                        Collections.Shuffle(priorSearches);
                        priorSearches.RemoveRange(100, priorSearches.Count - 100); // keep a random subset of 100
                    }
                }
            }

            Finish();
        }
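
The verification trick in this test is MultiReader: independently opened readers can be composed into one logical index so a plain IndexSearcher produces reference results. In isolation (r1 and r2 are assumed to be already-open IndexReader instances):

            IndexReader mock = new MultiReader(r1, r2);
            IndexSearcher reference = new IndexSearcher(mock);
            // Compare reference.Search(...) results against the shard searcher's.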
Exemplo n.º 23
0
        public virtual void Test10kPulsed()
        {
            // we always run this test with pulsing codec.
            Codec cp = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1));

            DirectoryInfo        f   = CreateTempDir("10kpulsed");
            BaseDirectoryWrapper dir = NewFSDirectory(f);

            dir.CheckIndexOnDispose = false; // we do this ourselves explicitly
            RandomIndexWriter iw = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp));

            Document  document = new Document();
            FieldType ft       = new FieldType(TextField.TYPE_STORED);

            switch (TestUtil.NextInt32(Random, 0, 2))
            {
            case 0:
                ft.IndexOptions = IndexOptions.DOCS_ONLY;
                break;

            case 1:
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                break;

            default:
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
                break;
            }

            Field field = NewField("field", "", ft);

            document.Add(field);

            //NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

            for (int i = 0; i < 10050; i++)
            {
                //field.StringValue = df.format(i);
                field.SetStringValue(i.ToString("00000", CultureInfo.InvariantCulture));
                iw.AddDocument(document);
            }

            IndexReader ir = iw.GetReader();

            iw.Dispose();

            TermsEnum te = MultiFields.GetTerms(ir, "field").GetIterator(null);
            DocsEnum  de = null;

            for (int i = 0; i < 10050; i++)
            {
                //string expected = df.format(i);
                string expected = i.ToString("00000", CultureInfo.InvariantCulture);
                assertEquals(expected, te.Next().Utf8ToString());
                de = TestUtil.Docs(Random, te, null, de, DocsFlags.NONE);
                assertTrue(de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.NextDoc());
            }
            ir.Dispose();

            TestUtil.CheckIndex(dir);
            dir.Dispose();
        }
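
Outside of tests, you would usually select the pulsing format per field rather than forcing it everywhere with TestUtil.AlwaysPostingsFormat. A sketch, assuming the standard per-field hook on the default codec is used; the "id" field name is hypothetical:

            private sealed class PulsingIdCodec : Lucene46Codec
            {
                private readonly PostingsFormat pulsing = new Pulsing41PostingsFormat(1);

                public override PostingsFormat GetPostingsFormatForField(string field)
                {
                    // Pulsing inlines the postings of very-low-frequency terms into
                    // the term dictionary, saving a seek for primary-key style lookups.
                    return field == "id" ? pulsing : base.GetPostingsFormatForField(field);
                }
            }
            // usage: indexWriterConfig.SetCodec(new PulsingIdCodec());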
Exemplo n.º 25
0
        public virtual void TestPhrasePrefix()
        {
            Directory         indexStore = NewDirectory();
            RandomIndexWriter writer     = new RandomIndexWriter(Random(), indexStore);

            Add("blueberry pie", writer);
            Add("blueberry strudel", writer);
            Add("blueberry pizza", writer);
            Add("blueberry chewing gum", writer);
            Add("bluebird pizza", writer);
            Add("bluebird foobar pizza", writer);
            Add("piccadilly circus", writer);

            IndexReader   reader   = writer.Reader;
            IndexSearcher searcher = NewSearcher(reader);

            // search for "blueberry pi*":
            MultiPhraseQuery query1 = new MultiPhraseQuery();
            // search for "strawberry pi*":
            MultiPhraseQuery query2 = new MultiPhraseQuery();

            query1.Add(new Term("body", "blueberry"));
            query2.Add(new Term("body", "strawberry"));

            LinkedList <Term> termsWithPrefix = new LinkedList <Term>();

            // this TermEnum gives "piccadilly", "pie" and "pizza".
            string    prefix = "pi";
            TermsEnum te     = MultiFields.GetFields(reader).GetTerms("body").GetIterator(null);

            te.SeekCeil(new BytesRef(prefix));
            do
            {
                string s = te.Term.Utf8ToString();
                if (s.StartsWith(prefix, StringComparison.Ordinal))
                {
                    termsWithPrefix.AddLast(new Term("body", s));
                }
                else
                {
                    break;
                }
            } while (te.Next() != null);

            query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
            Assert.AreEqual("body:\"blueberry (piccadilly pie pizza)\"", query1.ToString());
            query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
            Assert.AreEqual("body:\"strawberry (piccadilly pie pizza)\"", query2.ToString());

            ScoreDoc[] result;
            result = searcher.Search(query1, null, 1000).ScoreDocs;
            Assert.AreEqual(2, result.Length);
            result = searcher.Search(query2, null, 1000).ScoreDocs;
            Assert.AreEqual(0, result.Length);

            // search for "blue* pizza":
            MultiPhraseQuery query3 = new MultiPhraseQuery();

            termsWithPrefix.Clear();
            prefix = "blue";
            te.SeekCeil(new BytesRef(prefix));

            do
            {
                string s = te.Term.Utf8ToString();
                if (s.StartsWith(prefix, StringComparison.Ordinal))
                {
                    termsWithPrefix.AddLast(new Term("body", s));
                }
            } while (te.Next() != null);

            query3.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
            query3.Add(new Term("body", "pizza"));

            result = searcher.Search(query3, null, 1000).ScoreDocs;
            Assert.AreEqual(2, result.Length); // blueberry pizza, bluebird pizza
            Assert.AreEqual("body:\"(blueberry bluebird) pizza\"", query3.ToString());

            // test slop:
            query3.Slop = 1;
            result      = searcher.Search(query3, null, 1000).ScoreDocs;

            // just make sure no exc:
            searcher.Explain(query3, 0);

            Assert.AreEqual(3, result.Length); // blueberry pizza, bluebird pizza, bluebird
            // foobar pizza

            MultiPhraseQuery query4 = new MultiPhraseQuery();

            try
            {
                query4.Add(new Term("field1", "foo"));
                query4.Add(new Term("field2", "foobar"));
                Assert.Fail();
            }
            catch (System.ArgumentException)
            {
                // okay, all terms must belong to the same field
            }

            writer.Dispose();
            reader.Dispose();
            indexStore.Dispose();
        }
Exemplo n.º 26
0
        private void DoTestSeekDoesNotExist(Random r, int numField, IList <Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader)
        {
            IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>();

            if (Verbose)
            {
                Console.WriteLine("TEST: top random seeks");
            }

            {
                int num = AtLeast(100);
                for (int iter = 0; iter < num; iter++)
                {
                    // seek to random spot
                    string field = ("f" + r.Next(numField)).Intern();
                    Term   tx    = new Term(field, GetRandomString(r));

                    int spot = Array.BinarySearch(fieldTermsArray, tx);

                    if (spot < 0)
                    {
                        if (Verbose)
                        {
                            Console.WriteLine("TEST: non-exist seek to " + field + ":" + UnicodeUtil.ToHexString(tx.Text));
                        }

                        // term does not exist:
                        if (!tes.TryGetValue(field, out TermsEnum te))
                        {
                            te         = MultiFields.GetTerms(reader, field).GetEnumerator();
                            tes[field] = te;
                        }

                        if (Verbose)
                        {
                            Console.WriteLine("  got enum");
                        }

                        spot = -spot - 1;

                        if (spot == fieldTerms.Count || !fieldTerms[spot].Field.Equals(field, StringComparison.Ordinal))
                        {
                            Assert.AreEqual(TermsEnum.SeekStatus.END, te.SeekCeil(tx.Bytes));
                        }
                        else
                        {
                            Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(tx.Bytes));

                            if (Verbose)
                            {
                                Console.WriteLine("  got term=" + UnicodeUtil.ToHexString(te.Term.Utf8ToString()));
                                Console.WriteLine("  exp term=" + UnicodeUtil.ToHexString(fieldTerms[spot].Text));
                            }

                            Assert.AreEqual(fieldTerms[spot].Bytes, te.Term);

                            // now .next() this many times:
                            int ct = TestUtil.NextInt32(r, 5, 100);
                            for (int i = 0; i < ct; i++)
                            {
                                if (Verbose)
                                {
                                    Console.WriteLine("TEST: now next()");
                                }
                                if (1 + spot + i >= fieldTerms.Count)
                                {
                                    break;
                                }
                                Term term = fieldTerms[1 + spot + i];
                                if (!term.Field.Equals(field, StringComparison.Ordinal))
                                {
                                    Assert.IsFalse(te.MoveNext());
                                    break;
                                }
                                else
                                {
                                    Assert.IsTrue(te.MoveNext());
                                    BytesRef t = te.Term;

                                    if (Verbose)
                                    {
                                        Console.WriteLine("  got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString())));
                                        Console.WriteLine("       exp=" + UnicodeUtil.ToHexString(term.Text.ToString()));
                                    }

                                    Assert.AreEqual(term.Bytes, t);
                                }
                            }
                        }
                    }
                }
            }
        }
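
For reference, SeekCeil has three outcomes, and the branches above exercise two of them. A compact sketch of all three (assumes reader has an indexed "body" field):

            TermsEnum te = MultiFields.GetTerms(reader, "body").GetEnumerator();
            switch (te.SeekCeil(new BytesRef("foo")))
            {
                case TermsEnum.SeekStatus.FOUND:
                    // the exact term exists; the enum is positioned on it
                    break;
                case TermsEnum.SeekStatus.NOT_FOUND:
                    // positioned on the smallest term greater than "foo"
                    break;
                case TermsEnum.SeekStatus.END:
                    // every term in the field compares less than "foo"
                    break;
            }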
Exemplo n.º 27
0
        public virtual void TestPhrasePrefix()
        {
            Directory         indexStore = NewDirectory();
            RandomIndexWriter writer     = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, indexStore);
            Document doc1 = new Document();
            Document doc2 = new Document();
            Document doc3 = new Document();
            Document doc4 = new Document();
            Document doc5 = new Document();

            doc1.Add(NewTextField("body", "blueberry pie", Field.Store.YES));
            doc2.Add(NewTextField("body", "blueberry strudel", Field.Store.YES));
            doc3.Add(NewTextField("body", "blueberry pizza", Field.Store.YES));
            doc4.Add(NewTextField("body", "blueberry chewing gum", Field.Store.YES));
            doc5.Add(NewTextField("body", "piccadilly circus", Field.Store.YES));
            writer.AddDocument(doc1);
            writer.AddDocument(doc2);
            writer.AddDocument(doc3);
            writer.AddDocument(doc4);
            writer.AddDocument(doc5);
            IndexReader reader = writer.GetReader();

            writer.Dispose();

            IndexSearcher searcher = NewSearcher(reader);

            // PhrasePrefixQuery query1 = new PhrasePrefixQuery();
            MultiPhraseQuery query1 = new MultiPhraseQuery();
            // PhrasePrefixQuery query2 = new PhrasePrefixQuery();
            MultiPhraseQuery query2 = new MultiPhraseQuery();

            query1.Add(new Term("body", "blueberry"));
            query2.Add(new Term("body", "strawberry"));

            LinkedList <Term> termsWithPrefix = new LinkedList <Term>();

            // this TermEnum gives "piccadilly", "pie" and "pizza".
            string    prefix = "pi";
            TermsEnum te     = MultiFields.GetFields(reader).GetTerms("body").GetIterator(null);

            te.SeekCeil(new BytesRef(prefix));
            do
            {
                string s = te.Term.Utf8ToString();
                if (s.StartsWith(prefix, StringComparison.Ordinal))
                {
                    termsWithPrefix.AddLast(new Term("body", s));
                }
                else
                {
                    break;
                }
            } while (te.Next() != null);

            query1.Add(termsWithPrefix.ToArray(/*new Term[0]*/));
            query2.Add(termsWithPrefix.ToArray(/*new Term[0]*/));

            ScoreDoc[] result;
            result = searcher.Search(query1, null, 1000).ScoreDocs;
            Assert.AreEqual(2, result.Length);

            result = searcher.Search(query2, null, 1000).ScoreDocs;
            Assert.AreEqual(0, result.Length);
            reader.Dispose();
            indexStore.Dispose();
        }
Exemplo n.º 28
0
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString is null)
            {
                return;
            }
            Terms terms = MultiFields.GetTerms(reader, f.fieldName);

            if (terms is null)
            {
                return;
            }
            TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);

            try
            {
                ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();

                int           corpusNumDocs  = reader.NumDocs;
                ISet <string> processedTerms = new JCG.HashSet <string>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    string term = termAtt.ToString();
                    if (!processedTerms.Contains(term))
                    {
                        processedTerms.Add(term);
                        ScoreTermQueue  variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                        float           minScore  = 0;
                        Term            startTerm = new Term(f.fieldName, term);
                        AttributeSource atts      = new AttributeSource();
                        IMaxNonCompetitiveBoostAttribute maxBoostAtt =
                            atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                        SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                        //store the df so all variants use same idf
                        int             df                   = reader.DocFreq(startTerm);
                        int             numVariants          = 0;
                        int             totalVariantDocFreqs = 0;
                        BytesRef        possibleMatch;
                        IBoostAttribute boostAtt =
                            fe.Attributes.AddAttribute <IBoostAttribute>();
                        while (fe.MoveNext())
                        {
                            possibleMatch = fe.Term;
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq;
                            float score = boostAtt.Boost;
                            if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                                variantsQ.InsertWithOverflow(st);
                                minScore = variantsQ.Top.Score; // maintain minScore
                            }
                            maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                        }

                        if (numVariants > 0)
                        {
                            int avgDf = totalVariantDocFreqs / numVariants;
                            if (df == 0)    //no direct match we can use as df for all variants
                            {
                                df = avgDf; //use avg df of all variants
                            }

                            // take the top variants (scored by edit distance) and reset the score
                            // to include an IDF factor then add to the global queue for ranking
                            // overall top query terms
                            int size = variantsQ.Count;
                            for (int i = 0; i < size; i++)
                            {
                                ScoreTerm st = variantsQ.Pop();
                                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                                q.InsertWithOverflow(st);
                            }
                        }
                    }
                }
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
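
This expansion routine has the shape of FuzzyLikeThisQuery from Lucene.Net.Sandbox.Queries. Assuming that is the surrounding class, a typical caller looks like this sketch (analyzer, reader, and searcher are assumed to exist):

        FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(32, analyzer); // keep at most 32 expanded terms
        flt.AddTerms("bannana", "body", 0.5f, 2);                      // misspelled input, min similarity, prefix length
        TopDocs hits = searcher.Search(flt, 10);                       // variants ranked by edit distance and IDF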
Exemplo n.º 29
0
 public override void Run()
 {
     if (Verbose)
     {
         Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread");
     }
     while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTimeMS) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
     {
         try
         {
             IndexSearcher s = outerInstance.GetCurrentSearcher();
             try
             {
                 // Verify 1) IW is correctly setting
                 // diagnostics, and 2) segment warming for
                 // merged segments is actually happening:
                 foreach (AtomicReaderContext sub in s.IndexReader.Leaves)
                 {
                     SegmentReader segReader = (SegmentReader)sub.Reader;
                     IDictionary <string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics;
                     assertNotNull(diagnostics);
                     diagnostics.TryGetValue("source", out string source);
                     assertNotNull(source);
                     if (source.Equals("merge", StringComparison.Ordinal))
                     {
                         assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo,
                                    // LUCENENET: ConditionalWeakTable doesn't have ContainsKey, so we normalize to TryGetValue
                                    !outerInstance.m_assertMergedSegmentsWarmed || outerInstance.warmed.TryGetValue(segReader.core, out BooleanRef _));
                     }
                 }
                 if (s.IndexReader.NumDocs > 0)
                 {
                     outerInstance.SmokeTestSearcher(s);
                     Fields fields = MultiFields.GetFields(s.IndexReader);
                     if (fields == null)
                     {
                         continue;
                     }
                     Terms terms = fields.GetTerms("body");
                     if (terms == null)
                     {
                         continue;
                     }
                     TermsEnum termsEnum     = terms.GetEnumerator();
                     int       seenTermCount = 0;
                     int       shift;
                     int       trigger;
                     if (totTermCount < 30)
                     {
                         shift   = 0;
                         trigger = 1;
                     }
                     else
                     {
                         trigger = totTermCount / 30;
                         shift   = Random.Next(trigger);
                     }
                     while (J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond < stopTimeMS) // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
                     {
                         if (!termsEnum.MoveNext())
                         {
                             totTermCount.Value = seenTermCount;
                             break;
                         }
                         seenTermCount++;
                         // search 30 terms
                         if ((seenTermCount + shift) % trigger == 0)
                         {
                             //if (VERBOSE) {
                             //System.out.println(Thread.currentThread().getName() + " now search body:" + term.Utf8ToString());
                             //}
                             totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", termsEnum.Term))));
                         }
                     }
                     //if (VERBOSE) {
                     //System.out.println(Thread.currentThread().getName() + ": search done");
                     //}
                 }
             }
             finally
             {
                 outerInstance.ReleaseSearcher(s);
             }
         }
         catch (Exception t) when(t.IsThrowable())
         {
             Console.WriteLine(Thread.CurrentThread.Name + ": hit exc");
              outerInstance.m_failed.Value = true;
             Console.WriteLine(t.ToString());
             throw RuntimeException.Create(t);
         }
     }
 }
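
The acquire/release discipline in Run() is what SearcherManager packages up. A minimal sketch (assumes an open IndexWriter named writer):

     SearcherManager mgr = new SearcherManager(writer, true, new SearcherFactory());
     IndexSearcher s = mgr.Acquire();
     try
     {
         // run queries against a stable point-in-time snapshot
     }
     finally
     {
         mgr.Release(s); // never Dispose an acquired searcher directly
     }
     mgr.MaybeRefresh(); // pick up newly indexed documents for future Acquire() calls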