Beispiel #1
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in C#:
//ORIGINAL LINE: public org.neo4j.storageengine.api.schema.IndexSample sampleIndex() throws org.neo4j.internal.kernel.api.exceptions.schema.IndexNotFoundKernelException
        /// <summary>
        /// Samples the index by walking every term of every sampled field in each leaf of the
        /// searcher's index reader and feeding the term text together with its document
        /// frequency into a <c>DefaultNonUniqueIndexSampler</c>.
        /// </summary>
        /// <returns>
        /// The sampler's result computed for the reader's document count
        /// (<c>indexReader.numDocs()</c>).
        /// </returns>
        /// <exception cref="Exception">
        /// Wraps any <c>IOException</c> raised while reading terms; the original exception is
        /// preserved as <c>InnerException</c>.
        /// </exception>
        public override IndexSample SampleIndex()
        {
            NonUniqueIndexSampler sampler     = new DefaultNonUniqueIndexSampler(_indexSamplingConfig.sampleSizeLimit());
            IndexReader           indexReader = _indexSearcher.IndexReader;

            foreach (LeafReaderContext readerContext in indexReader.leaves())
            {
                try
                {
                    ISet <string> fieldNames = GetFieldNamesToSample(readerContext);
                    foreach (string fieldName in fieldNames)
                    {
                        Terms terms = readerContext.reader().terms(fieldName);
                        if (terms == null)
                        {
                            // This leaf has no terms for the field; nothing to sample.
                            continue;
                        }

                        TermsEnum termsEnum = LuceneDocumentStructure.originalTerms(terms, fieldName);
                        BytesRef  termsRef;
                        while ((termsRef = termsEnum.next()) != null)
                        {
                            sampler.Include(termsRef.utf8ToString(), termsEnum.docFreq());
                            // Checked after every term so a long-running sampling can be aborted.
                            CheckCancellation();
                        }
                    }
                }
                catch (IOException e)
                {
                    // System.Exception has no constructor that takes only an exception; pass the
                    // message and the cause explicitly so the IOException is kept as InnerException.
                    throw new Exception(e.Message, e);
                }
            }

            return sampler.Result(indexReader.numDocs());
        }
Beispiel #2
0
        /// <summary>
        /// Tests enum reuse with Pulsing1(Pulsing2(Standard)): indexes a single document and
        /// asserts that iterating all terms of field "foo" while passing the previous enum back
        /// in yields exactly 4 distinct enum instances, for both docs and docs-and-positions enums.
        /// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testNestedPulsing() throws Exception
        public virtual void testNestedPulsing()
        {
            // This test is always run with the (nested) pulsing codec.
            Codec nestedCodec = TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat());
            BaseDirectoryWrapper directory = newDirectory();
            var writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(nestedCodec));

            var document = new Document();
            document.add(new TextField("foo", "a b b c c c d e f g g g h i i j j k l l m m m", Field.Store.NO));

            // Note: reuse is imperfect. We end up with 4 enums here (reuse is lost when we get an
            // enum for 'm') because only the 'last' reused enum is tracked, not all of them.
            // That seems 'good enough' for now.
            writer.addDocument(document);
            DirectoryReader reader = writer.Reader;
            writer.close();

            AtomicReader onlySegment = getOnlySegmentReader(reader);
            IDictionary <DocsEnum, bool?> seenEnums = new IdentityHashMap <DocsEnum, bool?>();

            // Pass 1: plain docs enums.
            DocsEnum docsReuse = null;
            for (TermsEnum termIterator = onlySegment.terms("foo").iterator(null); termIterator.next() != null;)
            {
                docsReuse            = termIterator.docs(null, docsReuse, DocsEnum.FLAG_NONE);
                seenEnums[docsReuse] = true;
            }
            assertEquals(4, seenEnums.Count);

            seenEnums.Clear();

            // Pass 2: docs-and-positions enums, using a fresh terms iterator.
            DocsAndPositionsEnum positionsReuse = null;
            for (TermsEnum termIterator = onlySegment.terms("foo").iterator(null); termIterator.next() != null;)
            {
                positionsReuse            = termIterator.docsAndPositions(null, positionsReuse);
                seenEnums[positionsReuse] = true;
            }
            assertEquals(4, seenEnums.Count);

            reader.close();
            directory.close();
        }
Beispiel #3
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: protected void fill(String field, org.apache.lucene.index.TermsEnum termsEnum) throws java.io.IOException
            /// <summary>
            /// Consumes <paramref name="termsEnum"/> until it returns null and passes one
            /// <c>TermStats</c> per term to <c>insertWithOverflow</c>.
            /// </summary>
            /// <param name="field">Field name recorded in each <c>TermStats</c>.</param>
            /// <param name="termsEnum">Enumerator supplying the terms, their document frequency and total term frequency.</param>
            protected internal void fill(string field, TermsEnum termsEnum)
            {
                for (BytesRef current = termsEnum.next(); current != null; current = termsEnum.next())
                {
                    TermStats stats = new TermStats(field, current, termsEnum.docFreq(), termsEnum.totalTermFreq());
                    insertWithOverflow(stats);
                }
            }
Beispiel #4
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in C#:
//ORIGINAL LINE: private static org.apache.lucene.index.Terms getTerms(String value, int frequency) throws java.io.IOException
        /// <summary>
        /// Creates a mocked <c>Terms</c> whose enumerator is a mocked <c>TermsEnum</c> that
        /// returns one term built from <paramref name="value"/>, then null, and reports
        /// <paramref name="frequency"/> as its document frequency.
        /// </summary>
        /// <param name="value">Text whose bytes become the single term.</param>
        /// <param name="frequency">Value returned by the mocked <c>docFreq()</c>.</param>
        /// <returns>The mocked <c>Terms</c> instance.</returns>
        private static Terms GetTerms(string value, int frequency)
        {
            Terms     mockedTerms = mock(typeof(Terms));
            TermsEnum mockedEnum  = mock(typeof(TermsEnum));

            when(mockedTerms.GetEnumerator()).thenReturn(mockedEnum);

            // First call to next() yields the term, the second signals exhaustion with null.
            BytesRef singleTerm = new BytesRef(value.GetBytes());
            when(mockedEnum.next()).thenReturn(singleTerm).thenReturn(null);
            when(mockedEnum.docFreq()).thenReturn(frequency);

            return mockedTerms;
        }
Beispiel #5
0
        // TODO: this is only a basic test; the reuse logic is complicated, so add more cases.
        /// <summary>
        /// Indexes a single document with <c>Pulsing41PostingsFormat(1)</c> and asserts that
        /// iterating all terms of field "foo" while passing the previous enum back in yields
        /// exactly 2 distinct enum instances, for both docs and docs-and-positions enums.
        /// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testSophisticatedReuse() throws Exception
        public virtual void testSophisticatedReuse()
        {
            // This test is always run with the pulsing codec.
            Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));
            Directory directory = newDirectory();
            var writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

            var document = new Document();
            document.add(new TextField("foo", "a b b c c c d e f g g h i i j j k", Field.Store.NO));
            writer.addDocument(document);

            DirectoryReader reader = writer.Reader;
            writer.close();

            AtomicReader onlySegment = getOnlySegmentReader(reader);
            IDictionary <DocsEnum, bool?> seenEnums = new IdentityHashMap <DocsEnum, bool?>();

            // Pass 1: plain docs enums.
            DocsEnum docsReuse = null;
            for (TermsEnum termIterator = onlySegment.terms("foo").iterator(null); termIterator.next() != null;)
            {
                docsReuse            = termIterator.docs(null, docsReuse, DocsEnum.FLAG_NONE);
                seenEnums[docsReuse] = true;
            }
            assertEquals(2, seenEnums.Count);

            seenEnums.Clear();

            // Pass 2: docs-and-positions enums, using a fresh terms iterator.
            DocsAndPositionsEnum positionsReuse = null;
            for (TermsEnum termIterator = onlySegment.terms("foo").iterator(null); termIterator.next() != null;)
            {
                positionsReuse            = termIterator.docsAndPositions(null, positionsReuse);
                seenEnums[positionsReuse] = true;
            }
            assertEquals(2, seenEnums.Count);

            reader.close();
            directory.close();
        }
Beispiel #6
0
        // LUCENE-1448
        // TODO: rather than testing it this way, this could now be tested
        // with BaseTokenStreamTestCase.
        /// <summary>
        /// Adds two instances of "field" to one document, the first fed by a
        /// <c>TeeSinkTokenFilter</c> over the text "abcd   " and the second by that filter's
        /// sink stream, then checks the resulting term vector: one term with total frequency 2,
        /// at offsets 0-4 and 8-12.
        /// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception
        public virtual void testEndOffsetPositionWithTeeSinkTokenFilter()
        {
            Directory          directory          = newDirectory();
            Analyzer           whitespaceAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
            IndexWriter        writer             = new IndexWriter(directory, newIndexWriterConfig(TEST_VERSION_CURRENT, whitespaceAnalyzer));
            Document           document           = new Document();
            TokenStream        source             = whitespaceAnalyzer.tokenStream("field", "abcd   ");
            TeeSinkTokenFilter teeFilter          = new TeeSinkTokenFilter(source);
            TokenStream        sinkStream         = teeFilter.newSinkTokenStream();

            // Term vectors with offsets and positions are what the assertions below inspect.
            FieldType vectorType = new FieldType(TextField.TYPE_NOT_STORED)
            {
                StoreTermVectors         = true,
                StoreTermVectorOffsets   = true,
                StoreTermVectorPositions = true
            };

            document.add(new Field("field", teeFilter, vectorType));
            document.add(new Field("field", sinkStream, vectorType));
            writer.addDocument(document);
            writer.close();

            IndexReader reader     = DirectoryReader.open(directory);
            Terms       termVector = reader.getTermVectors(0).terms("field");
            assertEquals(1, termVector.size());

            TermsEnum vectorTerms = termVector.iterator(null);
            vectorTerms.next();
            assertEquals(2, vectorTerms.totalTermFreq());

            DocsAndPositionsEnum postings = vectorTerms.docsAndPositions(null, null);
            assertTrue(postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(2, postings.freq());

            // First occurrence of the term.
            postings.nextPosition();
            assertEquals(0, postings.startOffset());
            assertEquals(4, postings.endOffset());

            // Second occurrence of the term.
            postings.nextPosition();
            assertEquals(8, postings.startOffset());
            assertEquals(12, postings.endOffset());

            assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

            reader.close();
            directory.close();
        }
Beispiel #7
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in C#:
//ORIGINAL LINE: public void verify(org.neo4j.storageengine.api.NodePropertyAccessor accessor, int[] propKeyIds) throws org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException, java.io.IOException
        /// <summary>
        /// Walks the terms of every field that
        /// <c>LuceneDocumentStructure.useFieldForUniquenessVerification</c> accepts and, for each
        /// term whose document frequency is greater than 1, runs <c>SearchForDuplicates</c> with
        /// a term query for that value.
        /// </summary>
        /// <param name="accessor">Passed through to <c>SearchForDuplicates</c>.</param>
        /// <param name="propKeyIds">Passed through to <c>SearchForDuplicates</c>.</param>
        public override void Verify(NodePropertyAccessor accessor, int[] propKeyIds)
        {
            foreach (string field in AllFields())
            {
                if (!LuceneDocumentStructure.useFieldForUniquenessVerification(field))
                {
                    continue;
                }

                TermsEnum fieldTerms = LuceneDocumentStructure.originalTerms(TermsForField(field), field);
                for (BytesRef term = fieldTerms.next(); term != null; term = fieldTerms.next())
                {
                    // A term found in at most one document cannot be a duplicate.
                    if (fieldTerms.docFreq() <= 1)
                    {
                        continue;
                    }

                    TermQuery duplicateQuery = new TermQuery(new Term(field, term));
                    SearchForDuplicates(duplicateQuery, accessor, propKeyIds, fieldTerms.docFreq());
                }
            }
        }
Beispiel #8
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in C#:
//ORIGINAL LINE: public void verify(org.neo4j.storageengine.api.NodePropertyAccessor accessor, int[] propKeyIds) throws org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException, java.io.IOException
        /// <summary>
        /// For every leaf of the searcher's index reader, walks the terms of each field that
        /// <c>LuceneDocumentStructure.useFieldForUniquenessVerification</c> accepts and, for each
        /// term whose document frequency is greater than 1, runs a term query through a
        /// <c>DuplicateCheckingCollector</c>.
        /// </summary>
        /// <param name="accessor">Passed to <c>DuplicateCheckingCollector.ForProperties</c>.</param>
        /// <param name="propKeyIds">Passed to <c>DuplicateCheckingCollector.ForProperties</c>.</param>
        /// <exception cref="IndexEntryConflictException">
        /// Thrown when an <c>IOException</c> raised during verification carries one as its inner exception.
        /// </exception>
        /// <exception cref="IOException">
        /// Rethrown unchanged when its inner exception is not an <c>IndexEntryConflictException</c>.
        /// </exception>
        public override void Verify(NodePropertyAccessor accessor, int[] propKeyIds)
        {
            try
            {
                DuplicateCheckingCollector collector = DuplicateCheckingCollector.ForProperties(accessor, propKeyIds);
                IndexSearcher searcher = IndexSearcher();
                foreach (LeafReaderContext leafReaderContext in searcher.IndexReader.leaves())
                {
                    Fields fields = leafReaderContext.reader().fields();
                    foreach (string field in fields)
                    {
                        if (!LuceneDocumentStructure.useFieldForUniquenessVerification(field))
                        {
                            continue;
                        }

                        TermsEnum terms = LuceneDocumentStructure.originalTerms(fields.terms(field), field);
                        BytesRef  termsRef;
                        while ((termsRef = terms.next()) != null)
                        {
                            // Only terms present in more than one document can be duplicates.
                            if (terms.docFreq() > 1)
                            {
                                collector.Init(terms.docFreq());
                                searcher.search(new TermQuery(new Term(field, termsRef)), collector);
                            }
                        }
                    }
                }
            }
            catch (IOException e)
            {
                // Surface a wrapped conflict as the conflict itself.
                IndexEntryConflictException conflict = e.InnerException as IndexEntryConflictException;
                if (conflict != null)
                {
                    throw conflict;
                }

                // Bare rethrow: 'throw e;' would reset the stack trace of the IOException.
                throw;
            }
        }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void test10kPulsed() throws Exception
        /// <summary>
        /// Indexes 10050 documents, each holding one zero-padded five-digit number in "field",
        /// using <c>Pulsing41PostingsFormat(1)</c>, then checks that the terms come back in order
        /// and that every term matches exactly one document. The index is checked explicitly
        /// with <c>TestUtil.checkIndex</c> before the directory is closed.
        /// </summary>
        public virtual void test10kPulsed()
        {
            const int docCount = 10050;

            // This test is always run with the pulsing codec.
            Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));

            File indexPath = createTempDir("10kpulsed");
            BaseDirectoryWrapper directory = newFSDirectory(indexPath);

            // The index is checked explicitly at the end of the test instead.
            directory.CheckIndexOnClose = false;
            var writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

            var       document  = new Document();
            FieldType fieldType = new FieldType(TextField.TYPE_STORED);

            // Randomly pick one of three index option levels.
            int optionsChoice = TestUtil.Next(random(), 0, 2);
            if (optionsChoice == 0)
            {
                fieldType.IndexOptions = IndexOptions.DOCS_ONLY;
            }
            else if (optionsChoice == 1)
            {
                fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS;
            }
            else
            {
                fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
            }

            Field numberField = newField("field", "", fieldType);
            document.add(numberField);

            NumberFormat formatter = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

            // The same Document/Field pair is reused; only the field value changes per document.
            for (int docIndex = 0; docIndex < docCount; docIndex++)
            {
                numberField.StringValue = formatter.format(docIndex);
                writer.addDocument(document);
            }

            IndexReader reader = writer.Reader;
            writer.close();

            TermsEnum termIterator = MultiFields.getTerms(reader, "field").iterator(null);
            DocsEnum  docsReuse    = null;

            for (int docIndex = 0; docIndex < docCount; docIndex++)
            {
                string expectedTerm = formatter.format(docIndex);
                assertEquals(expectedTerm, termIterator.next().utf8ToString());

                // Each term must match exactly one document.
                docsReuse = TestUtil.docs(random(), termIterator, null, docsReuse, DocsEnum.FLAG_NONE);
                assertTrue(docsReuse.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsReuse.nextDoc());
            }

            reader.close();

            TestUtil.checkIndex(directory);
            directory.close();
        }
        /// <summary>
        /// Variant of the 10k test that still uses the pulsing codec but repeats every term
        /// <c>freqCutoff + 1</c> times per document, i.e. with a term frequency above the cutoff,
        /// to force pass-through to the underlying codec. Verifies term order and that each
        /// term matches exactly one document.
        /// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void test10kNotPulsed() throws Exception
        public virtual void test10kNotPulsed()
        {
            const int docCount = 10050;

            // This test is always run with the pulsing codec.
            int   freqCutoff   = TestUtil.Next(random(), 1, 10);
            Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(freqCutoff));

            File indexPath = createTempDir("10knotpulsed");
            BaseDirectoryWrapper directory = newFSDirectory(indexPath);

            // The index is checked explicitly at the end of the test instead.
            directory.CheckIndexOnClose = false;
            var writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

            var       document  = new Document();
            FieldType fieldType = new FieldType(TextField.TYPE_STORED);

            // Randomly pick one of three index option levels.
            int optionsChoice = TestUtil.Next(random(), 0, 2);
            if (optionsChoice == 0)
            {
                fieldType.IndexOptions = IndexOptions.DOCS_ONLY;
            }
            else if (optionsChoice == 1)
            {
                fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS;
            }
            else
            {
                fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
            }

            Field numberField = newField("field", "", fieldType);
            document.add(numberField);

            NumberFormat formatter = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

            // One repetition more than the cutoff (declared 'final' in the Java original).
            int repetitions = freqCutoff + 1;

            for (int docIndex = 0; docIndex < docCount; docIndex++)
            {
                var text = new StringBuilder();
                for (int repeat = 0; repeat < repetitions; repeat++)
                {
                    // Whitespace-separated copies of the same formatted number.
                    text.Append(formatter.format(docIndex)).Append(' ');
                }
                numberField.StringValue = text.ToString();
                writer.addDocument(document);
            }

            IndexReader reader = writer.Reader;
            writer.close();

            TermsEnum termIterator = MultiFields.getTerms(reader, "field").iterator(null);
            DocsEnum  docsReuse    = null;

            for (int docIndex = 0; docIndex < docCount; docIndex++)
            {
                string expectedTerm = formatter.format(docIndex);
                assertEquals(expectedTerm, termIterator.next().utf8ToString());

                // Each term must match exactly one document.
                docsReuse = TestUtil.docs(random(), termIterator, null, docsReuse, DocsEnum.FLAG_NONE);
                assertTrue(docsReuse.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsReuse.nextDoc());
            }

            reader.close();

            TestUtil.checkIndex(directory);
            directory.close();
        }
Beispiel #11
0
        /// <summary>
        /// Build the suggest index, using up to the specified
        ///  amount of temporary RAM while building.  Note that
        ///  the weights for the suggestions are ignored.
        ///  <para>
        ///  Every surface form from <paramref name="iterator"/> is indexed as one document into a
        ///  temporary on-disk index; the terms of its "body" field are then added to an FST
        ///  that is stored in <c>fst</c>. The temporary index files and directory are deleted
        ///  before the method returns. Also assigns <c>count</c> and <c>totTokens</c>
        ///  (members declared outside this method).
        ///  </para>
        /// </summary>
        /// <param name="iterator">Source of surface forms; must not report payloads or contexts.</param>
        /// <param name="ramBufferSizeMB">Value assigned to <c>IndexWriterConfig.RAMBufferSizeMB</c> for the temporary index.</param>
        /// <exception cref="System.ArgumentException">
        /// If the iterator has payloads or contexts, if no terms / no FST could be produced
        /// ("need at least one suggestion"), or if a term's gram count exceeds <c>grams</c>.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// If a temporary index file or the temporary directory cannot be deleted.
        /// </exception>
        public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
        {
            // Reject inputs this suggester cannot represent.
            if (iterator.HasPayloads())
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts())
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();
            // TODO: messy ... java7 has Files.createTempDirectory
            // ... but 4.x is java6:
            File   tempIndexPath = null;
            Random random        = new Random();

            // Keep trying new random directory names until mkdir() reports success.
            while (true)
            {
                tempIndexPath = new File(directory, prefix + ".index." + random.Next(int.MaxValue));
                if (tempIndexPath.mkdir())
                {
                    break;
                }
            }

            Directory dir = FSDirectory.Open(tempIndexPath);

            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);

            iwc.OpenMode        = IndexWriterConfig.OpenMode.CREATE;
            iwc.RAMBufferSizeMB = ramBufferSizeMB;
            IndexWriter writer = new IndexWriter(dir, iwc);

            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);

            // TODO: if only we had IndexOptions.TERMS_ONLY...
            ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
            ft.OmitNorms    = true;
            ft.Freeze();

            // A single Document/Field pair is reused for every surface form; only the
            // field's string value is replaced before each AddDocument call below.
            Document doc   = new Document();
            Field    field = new Field("body", "", ft);

            doc.Add(field);

            totTokens = 0;
            IndexReader reader = null;

            // Set to true only when the whole build completed; selects the close strategy in the finally block.
            bool success = false;

            count = 0;
            try
            {
                // Index each surface form as its own document; a null from the iterator ends the input.
                while (true)
                {
                    BytesRef surfaceForm = iterator.Next();
                    if (surfaceForm == null)
                    {
                        break;
                    }
                    field.StringValue = surfaceForm.Utf8ToString();
                    writer.AddDocument(doc);
                    count++;
                }
                reader = DirectoryReader.Open(writer, false);

                Terms terms = MultiFields.GetTerms(reader, "body");
                if (terms == null)
                {
                    throw new System.ArgumentException("need at least one suggestion");
                }

                // Move all ngrams into an FST:
                TermsEnum termsEnum = terms.Iterator(null);

                Outputs <long?> outputs = PositiveIntOutputs.Singleton;
                Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                IntsRef scratchInts = new IntsRef();
                while (true)
                {
                    BytesRef term = termsEnum.next();
                    if (term == null)
                    {
                        break;
                    }
                    int ngramCount = countGrams(term);
                    if (ngramCount > grams)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                    }
                    // Only single-gram terms contribute to the total token count.
                    if (ngramCount == 1)
                    {
                        totTokens += termsEnum.TotalTermFreq();
                    }

                    // The FST output for each term is its encoded total term frequency.
                    builder.Add(Util.ToIntsRef(term, scratchInts), encodeWeight(termsEnum.TotalTermFreq()));
                }

                fst = builder.Finish();
                if (fst == null)
                {
                    throw new System.ArgumentException("need at least one suggestion");
                }
                //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                /*
                 * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                 * Util.toDot(fst, pw, true, true);
                 * pw.close();
                 */

                success = true;
            }
            finally
            {
                try
                {
                    // On success use the plain close; on failure use CloseWhileHandlingException
                    // (presumably so that close errors do not hide the original failure — confirm in IOUtils).
                    if (success)
                    {
                        IOUtils.Close(writer, reader);
                    }
                    else
                    {
                        IOUtils.CloseWhileHandlingException(writer, reader);
                    }
                }
                finally
                {
                    // Remove every file of the temporary index, then the directory itself.
                    // NOTE(review): an exception thrown from this block replaces any exception
                    // already propagating from the try block above.
                    foreach (string file in dir.ListAll())
                    {
                        File path = new File(tempIndexPath, file);
                        if (path.Delete() == false)
                        {
                            throw new InvalidOperationException("failed to remove " + path);
                        }
                    }

                    if (tempIndexPath.Delete() == false)
                    {
                        throw new InvalidOperationException("failed to remove " + tempIndexPath);
                    }

                    // Reached only when all deletions above succeeded.
                    dir.Dispose();
                }
            }
        }