/// <summary>
/// Walks every leaf of the searcher's index reader, feeds each term of the fields returned by
/// <c>GetFieldNamesToSample</c> (with its document frequency) into a non-unique index sampler,
/// and returns the sampler's result for the reader's document count.
/// </summary>
/// <remarks>
/// The Java original declared <c>throws IndexNotFoundKernelException</c>; C# has no throws clauses.
/// <c>CheckCancellation</c> is invoked after every sampled term.
/// </remarks>
/// <returns>The sample computed by the sampler for <c>indexReader.numDocs()</c>.</returns>
/// <exception cref="Exception">Wraps an <c>IOException</c> raised while reading a leaf.</exception>
public override IndexSample SampleIndex()
{
    NonUniqueIndexSampler sampler = new DefaultNonUniqueIndexSampler(_indexSamplingConfig.sampleSizeLimit());
    IndexReader indexReader = _indexSearcher.IndexReader;

    foreach (LeafReaderContext readerContext in indexReader.leaves())
    {
        try
        {
            ISet<string> fieldNames = GetFieldNamesToSample(readerContext);
            foreach (string fieldName in fieldNames)
            {
                Terms terms = readerContext.reader().terms(fieldName);
                if (terms == null)
                {
                    // Nothing to sample for this field in this leaf.
                    continue;
                }

                TermsEnum termsEnum = LuceneDocumentStructure.originalTerms(terms, fieldName);
                BytesRef termsRef;
                while ((termsRef = termsEnum.next()) != null)
                {
                    sampler.Include(termsRef.utf8ToString(), termsEnum.docFreq());
                    CheckCancellation();
                }
            }
        }
        catch (IOException e)
        {
            // System.Exception has no constructor taking only an exception; pass the message
            // and keep the original failure reachable through InnerException.
            throw new Exception(e.Message, e);
        }
    }

    return sampler.Result(indexReader.numDocs());
}
/// <summary>
/// Tests enum reuse with Pulsing1(Pulsing2(Standard)).
/// </summary>
/// <remarks>The Java original declared <c>throws Exception</c>; C# has no equivalent clause.</remarks>
public virtual void testNestedPulsing()
{
    // This test always runs with the pulsing codec.
    Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat());
    BaseDirectoryWrapper directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
        random(),
        directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

    Document document = new Document();
    document.add(new TextField("foo", "a b b c c c d e f g g g h i i j j k l l m m m", Field.Store.NO));
    // Note: the reuse is imperfect. We end up with 4 enums here (reuse is lost when an enum
    // is requested for 'm') because only the 'last' reused enum is tracked, not all of them.
    // That seems 'good enough' for now.
    writer.addDocument(document);
    DirectoryReader reader = writer.Reader;
    writer.close();

    AtomicReader segment = getOnlySegmentReader(reader);
    IDictionary<DocsEnum, bool?> seenEnums = new IdentityHashMap<DocsEnum, bool?>();

    // Pass 1: count the distinct DocsEnum instances handed back while reusing.
    DocsEnum docsReuse = null;
    TermsEnum termsEnum = segment.terms("foo").iterator(null);
    while (termsEnum.next() != null)
    {
        docsReuse = termsEnum.docs(null, docsReuse, DocsEnum.FLAG_NONE);
        seenEnums[docsReuse] = true;
    }
    assertEquals(4, seenEnums.Count);

    // Pass 2: same check for DocsAndPositionsEnum, on a fresh terms iterator.
    seenEnums.Clear();
    DocsAndPositionsEnum positionsReuse = null;
    termsEnum = segment.terms("foo").iterator(null);
    while (termsEnum.next() != null)
    {
        positionsReuse = termsEnum.docsAndPositions(null, positionsReuse);
        seenEnums[positionsReuse] = true;
    }
    assertEquals(4, seenEnums.Count);

    reader.close();
    directory.close();
}
/// <summary>
/// Iterates <paramref name="termsEnum"/> until it returns null and passes a <c>TermStats</c>
/// for each term to <c>insertWithOverflow</c>.
/// </summary>
/// <remarks>The Java original declared <c>throws java.io.IOException</c>.</remarks>
/// <param name="field">Field name recorded in every <c>TermStats</c> created here.</param>
/// <param name="termsEnum">Enumeration supplying the terms, their doc frequency and total term frequency.</param>
protected internal void fill(string field, TermsEnum termsEnum)
{
    for (BytesRef current = termsEnum.next(); current != null; current = termsEnum.next())
    {
        insertWithOverflow(new TermStats(field, current, termsEnum.docFreq(), termsEnum.totalTermFreq()));
    }
}
/// <summary>
/// Builds a mocked <c>Terms</c> whose enumerator yields a single term made from
/// <paramref name="value"/> and then null, reporting <paramref name="frequency"/> as its doc frequency.
/// </summary>
/// <remarks>The Java original declared <c>throws java.io.IOException</c>.</remarks>
/// <param name="value">Text whose bytes form the only term.</param>
/// <param name="frequency">Value returned by the mocked <c>docFreq()</c>.</param>
/// <returns>The mocked <c>Terms</c> instance.</returns>
private static Terms GetTerms(string value, int frequency)
{
    TermsEnum enumMock = mock(typeof(TermsEnum));
    Terms termsMock = mock(typeof(Terms));

    when(termsMock.GetEnumerator()).thenReturn(enumMock);

    // First call to next() returns the term; the second call returns null to end iteration.
    BytesRef onlyTerm = new BytesRef(value.GetBytes());
    when(enumMock.next()).thenReturn(onlyTerm).thenReturn(null);
    when(enumMock.docFreq()).thenReturn(frequency);

    return termsMock;
}
// TODO: this is a basic test. This thing is complicated; add more.
/// <summary>
/// Indexes one document with the pulsing codec and checks that iterating all terms of "foo"
/// while reusing enums yields exactly two distinct enum instances, for both
/// <c>DocsEnum</c> and <c>DocsAndPositionsEnum</c>.
/// </summary>
/// <remarks>The Java original declared <c>throws Exception</c>; C# has no equivalent clause.</remarks>
public virtual void testSophisticatedReuse()
{
    // This test always runs with the pulsing codec.
    Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
        random(),
        directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

    Document document = new Document();
    document.add(new TextField("foo", "a b b c c c d e f g g h i i j j k", Field.Store.NO));
    writer.addDocument(document);
    DirectoryReader reader = writer.Reader;
    writer.close();

    AtomicReader segment = getOnlySegmentReader(reader);
    IDictionary<DocsEnum, bool?> seenEnums = new IdentityHashMap<DocsEnum, bool?>();

    // Pass 1: distinct DocsEnum instances returned while reusing.
    DocsEnum docsReuse = null;
    TermsEnum termsEnum = segment.terms("foo").iterator(null);
    while (termsEnum.next() != null)
    {
        docsReuse = termsEnum.docs(null, docsReuse, DocsEnum.FLAG_NONE);
        seenEnums[docsReuse] = true;
    }
    assertEquals(2, seenEnums.Count);

    // Pass 2: distinct DocsAndPositionsEnum instances, on a fresh terms iterator.
    seenEnums.Clear();
    DocsAndPositionsEnum positionsReuse = null;
    termsEnum = segment.terms("foo").iterator(null);
    while (termsEnum.next() != null)
    {
        positionsReuse = termsEnum.docsAndPositions(null, positionsReuse);
        seenEnums[positionsReuse] = true;
    }
    assertEquals(2, seenEnums.Count);

    reader.close();
    directory.close();
}
// LUCENE-1448
// TODO: instead of testing it this way, we can test
// with BaseTokenStreamTestCase now...
/// <summary>
/// Indexes the same field twice in one document (once from a <c>TeeSinkTokenFilter</c>, once from
/// its sink) with term vectors, offsets and positions stored, then asserts the offsets
/// recorded for both occurrences of the single term.
/// </summary>
/// <remarks>The Java original declared <c>throws Exception</c>; C# has no equivalent clause.</remarks>
public virtual void testEndOffsetPositionWithTeeSinkTokenFilter()
{
    Directory directory = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));

    Document document = new Document();
    TokenStream source = analyzer.tokenStream("field", "abcd ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
    TokenStream sink = tee.newSinkTokenStream();

    FieldType vectorType = new FieldType(TextField.TYPE_NOT_STORED)
    {
        StoreTermVectors = true,
        StoreTermVectorOffsets = true,
        StoreTermVectorPositions = true
    };

    Field teeField = new Field("field", tee, vectorType);
    Field sinkField = new Field("field", sink, vectorType);
    document.add(teeField);
    document.add(sinkField);
    writer.addDocument(document);
    writer.close();

    IndexReader reader = DirectoryReader.open(directory);
    Terms vector = reader.getTermVectors(0).terms("field");
    assertEquals(1, vector.size());

    TermsEnum termsEnum = vector.iterator(null);
    termsEnum.next();
    assertEquals(2, termsEnum.totalTermFreq());

    DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
    assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.freq());

    // First occurrence.
    positions.nextPosition();
    assertEquals(0, positions.startOffset());
    assertEquals(4, positions.endOffset());

    // Second occurrence.
    positions.nextPosition();
    assertEquals(8, positions.startOffset());
    assertEquals(12, positions.endOffset());

    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());

    reader.close();
    directory.close();
}
/// <summary>
/// For every field accepted by <c>LuceneDocumentStructure.useFieldForUniquenessVerification</c>,
/// walks its terms and calls <c>SearchForDuplicates</c> for each term whose document frequency
/// is greater than one.
/// </summary>
/// <remarks>
/// The Java original declared <c>throws IndexEntryConflictException, java.io.IOException</c>.
/// </remarks>
/// <param name="accessor">Passed through to <c>SearchForDuplicates</c>.</param>
/// <param name="propKeyIds">Passed through to <c>SearchForDuplicates</c>.</param>
public override void Verify(NodePropertyAccessor accessor, int[] propKeyIds)
{
    foreach (string field in AllFields())
    {
        if (!LuceneDocumentStructure.useFieldForUniquenessVerification(field))
        {
            continue;
        }

        TermsEnum fieldTerms = LuceneDocumentStructure.originalTerms(TermsForField(field), field);
        for (BytesRef current = fieldTerms.next(); current != null; current = fieldTerms.next())
        {
            if (fieldTerms.docFreq() <= 1)
            {
                // Only terms present in more than one document are searched.
                continue;
            }

            SearchForDuplicates(new TermQuery(new Term(field, current)), accessor, propKeyIds, fieldTerms.docFreq());
        }
    }
}
/// <summary>
/// Runs a <c>TermQuery</c> with a <c>DuplicateCheckingCollector</c> for every term that occurs in
/// more than one document, across all leaves of the searcher's index reader, restricted to
/// fields accepted by <c>LuceneDocumentStructure.useFieldForUniquenessVerification</c>.
/// </summary>
/// <remarks>
/// The Java original declared <c>throws IndexEntryConflictException, java.io.IOException</c>.
/// </remarks>
/// <param name="accessor">Used to create the duplicate-checking collector.</param>
/// <param name="propKeyIds">Used to create the duplicate-checking collector.</param>
/// <exception cref="IndexEntryConflictException">
/// Thrown when a caught <c>IOException</c> carries one as its <c>InnerException</c>.
/// </exception>
/// <exception cref="IOException">Rethrown unchanged when its cause is not a conflict.</exception>
public override void Verify(NodePropertyAccessor accessor, int[] propKeyIds)
{
    try
    {
        DuplicateCheckingCollector collector = DuplicateCheckingCollector.ForProperties(accessor, propKeyIds);
        IndexSearcher searcher = IndexSearcher();
        foreach (LeafReaderContext leafReaderContext in searcher.IndexReader.leaves())
        {
            Fields fields = leafReaderContext.reader().fields();
            foreach (string field in fields)
            {
                if (!LuceneDocumentStructure.useFieldForUniquenessVerification(field))
                {
                    continue;
                }

                TermsEnum terms = LuceneDocumentStructure.originalTerms(fields.terms(field), field);
                BytesRef termsRef;
                while ((termsRef = terms.next()) != null)
                {
                    if (terms.docFreq() > 1)
                    {
                        // Re-initialise the collector with the term's doc frequency before each search.
                        collector.Init(terms.docFreq());
                        searcher.search(new TermQuery(new Term(field, termsRef)), collector);
                    }
                }
            }
        }
    }
    catch (IOException e)
    {
        // Unwrap a conflict that arrived as the cause of an IOException.
        Exception cause = e.InnerException;
        if (cause is IndexEntryConflictException)
        {
            throw (IndexEntryConflictException)cause;
        }

        // Bare rethrow keeps the original stack trace; "throw e;" would reset it.
        throw;
    }
}
/// <summary>
/// Indexes 10050 documents, each holding one five-digit zero-padded number in "field", with the
/// pulsing codec, then checks that the terms come back in order and that each term matches
/// exactly one document.
/// </summary>
/// <remarks>The Java original declared <c>throws Exception</c>; C# has no equivalent clause.</remarks>
public virtual void test10kPulsed()
{
    const int docCount = 10050;

    // This test always runs with the pulsing codec.
    Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));

    File indexDir = createTempDir("10kpulsed");
    BaseDirectoryWrapper directory = newFSDirectory(indexDir);
    directory.CheckIndexOnClose = false; // the index is checked explicitly at the end of the test

    RandomIndexWriter writer = new RandomIndexWriter(
        random(),
        directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

    Document document = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);

    // Pick one of the three index-option levels based on the random draw.
    int optionsChoice = TestUtil.Next(random(), 0, 2);
    if (optionsChoice == 0)
    {
        fieldType.IndexOptions = IndexOptions.DOCS_ONLY;
    }
    else if (optionsChoice == 1)
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS;
    }
    else
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    }

    Field field = newField("field", "", fieldType);
    document.add(field);

    NumberFormat formatter = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

    // The same Document/Field instance is re-added with a new value each time.
    for (int docIndex = 0; docIndex < docCount; docIndex++)
    {
        field.StringValue = formatter.format(docIndex);
        writer.addDocument(document);
    }

    IndexReader reader = writer.Reader;
    writer.close();

    TermsEnum termsEnum = MultiFields.getTerms(reader, "field").iterator(null);
    DocsEnum docsEnum = null;

    for (int docIndex = 0; docIndex < docCount; docIndex++)
    {
        string expected = formatter.format(docIndex);
        assertEquals(expected, termsEnum.next().utf8ToString());

        docsEnum = TestUtil.docs(random(), termsEnum, null, docsEnum, DocsEnum.FLAG_NONE);
        assertTrue(docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
    }

    reader.close();
    TestUtil.checkIndex(directory);
    directory.close();
}
/// <summary>
/// A variant that uses pulsing, but uses a high TF to force pass-through to the underlying codec.
/// </summary>
/// <remarks>
/// Every document repeats its five-digit number <c>freqCutoff + 1</c> times.
/// The Java original declared <c>throws Exception</c>; C# has no equivalent clause.
/// </remarks>
public virtual void test10kNotPulsed()
{
    const int docCount = 10050;

    // This test always runs with the pulsing codec.
    int freqCutoff = TestUtil.Next(random(), 1, 10);
    Codec pulsingCodec = TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(freqCutoff));

    File indexDir = createTempDir("10knotpulsed");
    BaseDirectoryWrapper directory = newFSDirectory(indexDir);
    directory.CheckIndexOnClose = false; // the index is checked explicitly at the end of the test

    RandomIndexWriter writer = new RandomIndexWriter(
        random(),
        directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(pulsingCodec));

    Document document = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);

    // Pick one of the three index-option levels based on the random draw.
    int optionsChoice = TestUtil.Next(random(), 0, 2);
    if (optionsChoice == 0)
    {
        fieldType.IndexOptions = IndexOptions.DOCS_ONLY;
    }
    else if (optionsChoice == 1)
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS;
    }
    else
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    }

    Field field = newField("field", "", fieldType);
    document.add(field);

    NumberFormat formatter = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

    // One more repetition than the cutoff handed to the postings format (was 'final' in Java).
    int repetitions = freqCutoff + 1;

    for (int docIndex = 0; docIndex < docCount; docIndex++)
    {
        StringBuilder text = new StringBuilder();
        for (int rep = 0; rep < repetitions; rep++)
        {
            // number followed by a whitespace separator
            text.Append(formatter.format(docIndex)).Append(' ');
        }
        field.StringValue = text.ToString();
        writer.addDocument(document);
    }

    IndexReader reader = writer.Reader;
    writer.close();

    TermsEnum termsEnum = MultiFields.getTerms(reader, "field").iterator(null);
    DocsEnum docsEnum = null;

    for (int docIndex = 0; docIndex < docCount; docIndex++)
    {
        string expected = formatter.format(docIndex);
        assertEquals(expected, termsEnum.next().utf8ToString());

        docsEnum = TestUtil.docs(random(), termsEnum, null, docsEnum, DocsEnum.FLAG_NONE);
        assertTrue(docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
    }

    reader.close();
    TestUtil.checkIndex(directory);
    directory.close();
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
/// <remarks>
/// The surface forms are first written to a temporary on-disk index; the terms of its
/// "body" field are then moved into an FST. The temporary index files and directory are
/// deleted before returning, and the directory handle is always disposed.
/// </remarks>
/// <param name="iterator">Source of surface forms; must not expose payloads or contexts.</param>
/// <param name="ramBufferSizeMB">RAM buffer size assigned to the temporary index writer.</param>
/// <exception cref="System.ArgumentException">
/// The iterator has payloads or contexts, no suggestion was supplied, or a term contains
/// more grams than <c>grams</c>.
/// </exception>
/// <exception cref="InvalidOperationException">A temporary file or directory could not be removed.</exception>
public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads())
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts())
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    File tempIndexPath = null;
    Random random = new Random();
    while (true)
    {
        // Retry with a new random suffix until a directory is actually created.
        tempIndexPath = new File(directory, prefix + ".index." + random.Next(int.MaxValue));
        if (tempIndexPath.mkdir())
        {
            break;
        }
    }

    Directory dir = FSDirectory.Open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);
    iwc.OpenMode = IndexWriterConfig.OpenMode.CREATE;
    iwc.RAMBufferSizeMB = ramBufferSizeMB;
    IndexWriter writer = new IndexWriter(dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
    ft.OmitNorms = true;
    ft.Freeze();

    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.Add(field);

    totTokens = 0;
    IndexReader reader = null;

    bool success = false;
    count = 0;
    try
    {
        // Index every surface form as one document, reusing the same Document/Field.
        while (true)
        {
            BytesRef surfaceForm = iterator.Next();
            if (surfaceForm == null)
            {
                break;
            }
            field.StringValue = surfaceForm.Utf8ToString();
            writer.AddDocument(doc);
            count++;
        }

        reader = DirectoryReader.Open(writer, false);

        Terms terms = MultiFields.GetTerms(reader, "body");
        if (terms == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }

        // Move all ngrams into an FST:
        TermsEnum termsEnum = terms.Iterator(null);

        Outputs<long?> outputs = PositiveIntOutputs.Singleton;
        Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

        IntsRef scratchInts = new IntsRef();
        while (true)
        {
            // NOTE(review): lower-case next() differs from the PascalCase calls around it
            // (Iterator, TotalTermFreq) — confirm against the TermsEnum API in use.
            BytesRef term = termsEnum.next();
            if (term == null)
            {
                break;
            }
            int ngramCount = countGrams(term);
            if (ngramCount > grams)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
            }
            if (ngramCount == 1)
            {
                // Only single-gram terms contribute to the total token count.
                totTokens += termsEnum.TotalTermFreq();
            }

            builder.Add(Util.ToIntsRef(term, scratchInts), encodeWeight(termsEnum.TotalTermFreq()));
        }

        fst = builder.Finish();
        if (fst == null)
        {
            throw new System.ArgumentException("need at least one suggestion");
        }

        success = true;
    }
    finally
    {
        try
        {
            if (success)
            {
                IOUtils.Close(writer, reader);
            }
            else
            {
                // A failure is already propagating; close without raising a second one.
                IOUtils.CloseWhileHandlingException(writer, reader);
            }
        }
        finally
        {
            try
            {
                foreach (string file in dir.ListAll())
                {
                    File path = new File(tempIndexPath, file);
                    if (path.Delete() == false)
                    {
                        throw new InvalidOperationException("failed to remove " + path);
                    }
                }

                if (tempIndexPath.Delete() == false)
                {
                    throw new InvalidOperationException("failed to remove " + tempIndexPath);
                }
            }
            finally
            {
                // Release the directory handle even when a temporary file could not be
                // removed; previously a failed delete skipped this call.
                dir.Dispose();
            }
        }
    }
}