/// <summary>
/// Constructs a <see cref="NumericConfig"/> object.
/// </summary>
/// <param name="precisionStep">the precision step used to index the numeric values</param>
/// <param name="format">the <see cref="NumberFormat"/> used to parse a <see cref="string"/> to an <see cref="object"/> representing a .NET numeric type</param>
/// <param name="type">the numeric type used to index the numeric values</param>
/// <seealso cref="NumericConfig.PrecisionStep"/>
/// <seealso cref="NumericConfig.NumberFormat"/>
/// <seealso cref="Type"/>
public NumericConfig(int precisionStep, NumberFormat format, FieldType.NumericType type)
{
    PrecisionStep = precisionStep;
    NumberFormat = format;
    Type = type;
}
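// A minimal usage sketch (not from the original source): configure how 32-bit
// integer values are indexed and parsed. "MyNumberFormat" is a hypothetical
// NumberFormat subclass; substitute whatever concrete implementation your build provides.
NumberFormat myFormat = new MyNumberFormat(); // hypothetical subclass
NumericConfig intConfig = new NumericConfig(4, myFormat, FieldType.NumericType.INT);
// intConfig.PrecisionStep == 4; intConfig.Type == FieldType.NumericType.INT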
public override void SetUp()
{
    base.SetUp();
    Dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000)));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.OmitNorms = true;
    Field field = NewField("field", "", customType);
    doc.Add(field);

    NumberFormatInfo df = new NumberFormatInfo();
    df.NumberDecimalDigits = 0;
    //NumberFormat df = new DecimalFormat("000", new DecimalFormatSymbols(Locale.ROOT));

    for (int i = 0; i < 1000; i++)
    {
        field.StringValue = i.ToString(df);
        writer.AddDocument(doc);
    }

    Reader = writer.Reader;
    writer.Dispose();
    Searcher = NewSearcher(Reader);
}
public virtual void TestBasic()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
    Field f = NewField("foo", "this is a test test", ft);
    doc.Add(f);
    for (int i = 0; i < 100; i++)
    {
        w.AddDocument(doc);
    }

    IndexReader reader = w.Reader;
    w.Dispose();

    Assert.IsNull(MultiFields.GetTermPositionsEnum(reader, null, "foo", new BytesRef("test")));

    DocsEnum de = TestUtil.Docs(Random(), reader, "foo", new BytesRef("test"), null, null, DocsEnum.FLAG_FREQS);
    while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        Assert.AreEqual(2, de.Freq());
    }

    reader.Dispose();
    dir.Dispose();
}
public override void SetUp()
{
    base.SetUp();
    _dir = NewDirectory();
    _indexWriter = new RandomIndexWriter(Random(), _dir, new MockAnalyzer(Random()), Similarity, TimeZone);

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.StoreTermVectors = true;
    ft.StoreTermVectorOffsets = true;
    ft.StoreTermVectorPositions = true;

    Analyzer analyzer = new MockAnalyzer(Random());

    Document doc;
    for (int i = 0; i < 100; i++)
    {
        doc = new Document();
        // use random numbers as stand-in content; the Java original relied on Random.toString()
        doc.Add(new Field(_idFieldName, Random().Next().ToString(), ft));
        doc.Add(new Field(_textFieldName, new StringBuilder(Random().Next().ToString()).Append(Random().Next().ToString()).Append(Random().Next().ToString()).ToString(), ft));
        doc.Add(new Field(_classFieldName, Random().Next().ToString(), ft));
        _indexWriter.AddDocument(doc, analyzer);
    }

    _indexWriter.Commit();
    _originalIndex = SlowCompositeReaderWrapper.Wrap(_indexWriter.Reader);
}
public virtual void RunTest(Random random, Directory directory)
{
    IndexWriter writer = new IndexWriter(directory, ((IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, ANALYZER).SetOpenMode(OpenMode_e.CREATE).SetMaxBufferedDocs(2)).SetMergePolicy(NewLogMergePolicy()));

    for (int iter = 0; iter < NUM_ITER; iter++)
    {
        int iterFinal = iter;

        ((LogMergePolicy)writer.Config.MergePolicy).MergeFactor = 1000;

        FieldType customType = new FieldType(StringField.TYPE_STORED);
        customType.OmitNorms = true;

        for (int i = 0; i < 200; i++)
        {
            Document d = new Document();
            d.Add(NewField("id", Convert.ToString(i), customType));
            d.Add(NewField("contents", English.IntToEnglish(i), customType));
            writer.AddDocument(d);
        }

        ((LogMergePolicy)writer.Config.MergePolicy).MergeFactor = 4;

        ThreadClass[] threads = new ThreadClass[NUM_THREADS];

        for (int i = 0; i < NUM_THREADS; i++)
        {
            int iFinal = i;
            IndexWriter writerFinal = writer;
            threads[i] = new ThreadAnonymousInnerClassHelper(this, iterFinal, customType, iFinal, writerFinal);
        }

        for (int i = 0; i < NUM_THREADS; i++)
        {
            threads[i].Start();
        }

        for (int i = 0; i < NUM_THREADS; i++)
        {
            threads[i].Join();
        }

        Assert.IsTrue(!Failed);

        int expectedDocCount = (int)((1 + iter) * (200 + 8 * NUM_ITER2 * (NUM_THREADS / 2.0) * (1 + NUM_THREADS)));

        Assert.AreEqual(expectedDocCount, writer.NumDocs(), "index=" + writer.SegString() + " numDocs=" + writer.NumDocs() + " maxDoc=" + writer.MaxDoc + " config=" + writer.Config);
        Assert.AreEqual(expectedDocCount, writer.MaxDoc, "index=" + writer.SegString() + " numDocs=" + writer.NumDocs() + " maxDoc=" + writer.MaxDoc + " config=" + writer.Config);

        writer.Dispose();
        writer = new IndexWriter(directory, (IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, ANALYZER).SetOpenMode(OpenMode_e.APPEND).SetMaxBufferedDocs(2));

        DirectoryReader reader = DirectoryReader.Open(directory);
        Assert.AreEqual(1, reader.Leaves.Count, "reader=" + reader);
        Assert.AreEqual(expectedDocCount, reader.NumDocs);
        reader.Dispose();
    }
    writer.Dispose();
}
public virtual void TestDoubleOffsetCounting()
{
    Directory dir = NewDirectory();
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
    Document doc = new Document();
    FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorPositions = true;
    customType.StoreTermVectorOffsets = true;
    Field f = NewField("field", "abcd", customType);
    doc.Add(f);
    doc.Add(f);
    Field f2 = NewField("field", "", customType);
    doc.Add(f2);
    doc.Add(f);
    w.AddDocument(doc);
    w.Dispose();

    IndexReader r = DirectoryReader.Open(dir);
    Terms vector = r.GetTermVectors(0).Terms("field");
    Assert.IsNotNull(vector);
    TermsEnum termsEnum = vector.Iterator(null);
    Assert.IsNotNull(termsEnum.Next());
    Assert.AreEqual("", termsEnum.Term().Utf8ToString());

    // Token "" occurred once
    Assert.AreEqual(1, termsEnum.TotalTermFreq());

    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.NextPosition();
    Assert.AreEqual(8, dpEnum.StartOffset());
    Assert.AreEqual(8, dpEnum.EndOffset());
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

    // Token "abcd" occurred three times
    Assert.AreEqual(new BytesRef("abcd"), termsEnum.Next());
    dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
    Assert.AreEqual(3, termsEnum.TotalTermFreq());

    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.NextPosition();
    Assert.AreEqual(0, dpEnum.StartOffset());
    Assert.AreEqual(4, dpEnum.EndOffset());

    dpEnum.NextPosition();
    Assert.AreEqual(4, dpEnum.StartOffset());
    Assert.AreEqual(8, dpEnum.EndOffset());

    dpEnum.NextPosition();
    Assert.AreEqual(8, dpEnum.StartOffset());
    Assert.AreEqual(12, dpEnum.EndOffset());

    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    Assert.IsNull(termsEnum.Next());

    r.Dispose();
    dir.Dispose();
}
public virtual void TestBasic()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Iwc);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (Random().NextBoolean())
    {
        ft.StoreTermVectors = true;
        ft.StoreTermVectorPositions = Random().NextBoolean();
        ft.StoreTermVectorOffsets = Random().NextBoolean();
    }
    Token[] tokens = new Token[]
    {
        MakeToken("a", 1, 0, 6),
        MakeToken("b", 1, 8, 9),
        MakeToken("a", 1, 9, 17),
        MakeToken("c", 1, 19, 50)
    };
    doc.Add(new Field("content", new CannedTokenStream(tokens), ft));

    w.AddDocument(doc);
    IndexReader r = w.Reader;
    w.Dispose();

    DocsAndPositionsEnum dp = MultiFields.GetTermPositionsEnum(r, null, "content", new BytesRef("a"));
    Assert.IsNotNull(dp);
    Assert.AreEqual(0, dp.NextDoc());
    Assert.AreEqual(2, dp.Freq());
    Assert.AreEqual(0, dp.NextPosition());
    Assert.AreEqual(0, dp.StartOffset());
    Assert.AreEqual(6, dp.EndOffset());
    Assert.AreEqual(2, dp.NextPosition());
    Assert.AreEqual(9, dp.StartOffset());
    Assert.AreEqual(17, dp.EndOffset());
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());

    dp = MultiFields.GetTermPositionsEnum(r, null, "content", new BytesRef("b"));
    Assert.IsNotNull(dp);
    Assert.AreEqual(0, dp.NextDoc());
    Assert.AreEqual(1, dp.Freq());
    Assert.AreEqual(1, dp.NextPosition());
    Assert.AreEqual(8, dp.StartOffset());
    Assert.AreEqual(9, dp.EndOffset());
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());

    dp = MultiFields.GetTermPositionsEnum(r, null, "content", new BytesRef("c"));
    Assert.IsNotNull(dp);
    Assert.AreEqual(0, dp.NextDoc());
    Assert.AreEqual(1, dp.Freq());
    Assert.AreEqual(3, dp.NextPosition());
    Assert.AreEqual(19, dp.StartOffset());
    Assert.AreEqual(50, dp.EndOffset());
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());

    r.Dispose();
    dir.Dispose();
}
public virtual void TestForceMergeDeletes()
{
    Directory dir = NewDirectory();
    IndexWriter writer = new IndexWriter(dir, (IndexWriterConfig)NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH));
    Document document = new Document();

    FieldType customType = new FieldType();
    customType.Stored = true;

    FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED);
    customType1.Tokenized = false;
    customType1.StoreTermVectors = true;
    customType1.StoreTermVectorPositions = true;
    customType1.StoreTermVectorOffsets = true;

    Field idField = NewStringField("id", "", Field.Store.NO);
    document.Add(idField);
    Field storedField = NewField("stored", "stored", customType);
    document.Add(storedField);
    Field termVectorField = NewField("termVector", "termVector", customType1);
    document.Add(termVectorField);
    for (int i = 0; i < 10; i++)
    {
        idField.StringValue = "" + i;
        writer.AddDocument(document);
    }
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    Assert.AreEqual(10, ir.MaxDoc);
    Assert.AreEqual(10, ir.NumDocs);
    ir.Dispose();

    IndexWriterConfig dontMergeConfig = (new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))).SetMergePolicy(NoMergePolicy.COMPOUND_FILES);
    writer = new IndexWriter(dir, dontMergeConfig);
    writer.DeleteDocuments(new Term("id", "0"));
    writer.DeleteDocuments(new Term("id", "7"));
    writer.Dispose();

    ir = DirectoryReader.Open(dir);
    Assert.AreEqual(8, ir.NumDocs);
    ir.Dispose();

    writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
    Assert.AreEqual(8, writer.NumDocs());
    Assert.AreEqual(10, writer.MaxDoc);
    writer.ForceMergeDeletes();
    Assert.AreEqual(8, writer.NumDocs());
    writer.Dispose();

    ir = DirectoryReader.Open(dir);
    Assert.AreEqual(8, ir.MaxDoc);
    Assert.AreEqual(8, ir.NumDocs);
    ir.Dispose();
    dir.Dispose();
}
public virtual void TestPostings()
{
    Directory dir = NewFSDirectory(CreateTempDir("postings"));
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetCodec(Codec.ForName("Lucene40"));
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc);
    Document doc = new Document();

    // id field
    FieldType idType = new FieldType(StringField.TYPE_NOT_STORED);
    idType.StoreTermVectors = true;
    Field idField = new Field("id", "", idType);
    doc.Add(idField);

    // title field: short text field
    FieldType titleType = new FieldType(TextField.TYPE_NOT_STORED);
    titleType.StoreTermVectors = true;
    titleType.StoreTermVectorPositions = true;
    titleType.StoreTermVectorOffsets = true;
    titleType.IndexOptions = IndexOptions();
    Field titleField = new Field("title", "", titleType);
    doc.Add(titleField);

    // body field: long text field
    FieldType bodyType = new FieldType(TextField.TYPE_NOT_STORED);
    bodyType.StoreTermVectors = true;
    bodyType.StoreTermVectorPositions = true;
    bodyType.StoreTermVectorOffsets = true;
    bodyType.IndexOptions = IndexOptions();
    Field bodyField = new Field("body", "", bodyType);
    doc.Add(bodyField);

    int numDocs = AtLeast(1000);
    for (int i = 0; i < numDocs; i++)
    {
        idField.StringValue = Convert.ToString(i);
        titleField.StringValue = FieldValue(1);
        bodyField.StringValue = FieldValue(3);
        iw.AddDocument(doc);
        if (Random().Next(20) == 0)
        {
            iw.DeleteDocuments(new Term("id", Convert.ToString(i)));
        }
    }
    if (Random().NextBoolean())
    {
        // delete 1-100% of docs
        iw.DeleteDocuments(new Term("title", Terms[Random().Next(Terms.Length)]));
    }
    iw.Dispose();
    dir.Dispose(); // checkindex
}
public virtual void TestMixedMerge()
{
    Directory ram = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random());
    IndexWriter writer = new IndexWriter(ram, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMaxBufferedDocs(3).SetMergePolicy(NewLogMergePolicy(2)));
    Document d = new Document();

    // this field will have norms
    Field f1 = NewTextField("f1", "this field has norms", Field.Store.NO);
    d.Add(f1);

    // this field will NOT have norms
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.OmitNorms = true;
    Field f2 = NewField("f2", "this field has NO norms in all docs", customType);
    d.Add(f2);

    for (int i = 0; i < 30; i++)
    {
        writer.AddDocument(d);
    }

    // now we add another document which has norms for field f2 and not for f1,
    // and verify that the SegmentMerger keeps things consistent
    d = new Document();

    // reversed
    d.Add(NewField("f1", "this field has norms", customType));
    d.Add(NewTextField("f2", "this field has NO norms in all docs", Field.Store.NO));

    for (int i = 0; i < 30; i++)
    {
        writer.AddDocument(d);
    }

    // force merge
    writer.ForceMerge(1);
    // flush
    writer.Dispose();

    SegmentReader reader = GetOnlySegmentReader(DirectoryReader.Open(ram));
    FieldInfos fi = reader.FieldInfos;
    Assert.IsTrue(fi.FieldInfo("f1").OmitsNorms(), "OmitNorms field bit should be set.");
    Assert.IsTrue(fi.FieldInfo("f2").OmitsNorms(), "OmitNorms field bit should be set.");

    reader.Dispose();
    ram.Dispose();
}
/// <summary>
/// Create a new mutable FieldType with all of the properties from <paramref name="ref"/>.
/// </summary>
public FieldType(FieldType @ref)
{
    this.Indexed_Renamed = @ref.Indexed;
    this.Stored_Renamed = @ref.Stored;
    this.Tokenized_Renamed = @ref.Tokenized;
    this.StoreTermVectors_Renamed = @ref.StoreTermVectors;
    this.StoreTermVectorOffsets_Renamed = @ref.StoreTermVectorOffsets;
    this.StoreTermVectorPositions_Renamed = @ref.StoreTermVectorPositions;
    this.StoreTermVectorPayloads_Renamed = @ref.StoreTermVectorPayloads;
    this.OmitNorms_Renamed = @ref.OmitNorms;
    this._indexOptions = @ref.IndexOptions;
    this.docValueType = @ref.DocValueType;
    this.numericType = @ref.NumericTypeValue;
    // Do not copy frozen!
}
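// A short sketch of why the copy constructor is useful (illustrative, not from
// the original source): copy a frozen type and tweak the copy, since the frozen
// flag itself is deliberately not copied.
FieldType baseType = new FieldType(TextField.TYPE_NOT_STORED);
baseType.Freeze(); // baseType can no longer be mutated

FieldType withVectors = new FieldType(baseType); // copies all properties, stays mutable
withVectors.StoreTermVectors = true;             // legal: only the copy is modified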
public virtual void TestBogusTermVectors()
{
    Directory dir = NewDirectory();
    IndexWriter iw = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.StoreTermVectors = true;
    ft.StoreTermVectorOffsets = true;
    Field field = new Field("foo", "", ft);
    field.TokenStream = new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4));
    doc.Add(field);
    iw.AddDocument(doc);
    iw.Dispose();
    dir.Dispose(); // checkindex
}
public virtual void TestBinaryFieldInIndex()
{
    FieldType ft = new FieldType();
    ft.Stored = true;
    IndexableField binaryFldStored = new StoredField("binaryStored", System.Text.UTF8Encoding.UTF8.GetBytes(BinaryValStored));
    IndexableField stringFldStored = new Field("stringStored", BinaryValStored, ft);

    Documents.Document doc = new Documents.Document();
    doc.Add(binaryFldStored);
    doc.Add(stringFldStored);

    // test for field count
    Assert.AreEqual(2, doc.Fields.Count);

    // add the doc to a ram index
    Directory dir = NewDirectory();
    Random r = Random();
    RandomIndexWriter writer = new RandomIndexWriter(r, dir);
    writer.AddDocument(doc);

    // open a reader and fetch the document
    IndexReader reader = writer.Reader;
    Documents.Document docFromReader = reader.Document(0);
    Assert.IsTrue(docFromReader != null);

    // fetch the binary stored field and compare its content with the original one
    BytesRef bytes = docFromReader.GetBinaryValue("binaryStored");
    Assert.IsNotNull(bytes);
    string binaryFldStoredTest = Encoding.UTF8.GetString((byte[])(Array)bytes.Bytes).Substring(bytes.Offset, bytes.Length);
    //new string(bytes.Bytes, bytes.Offset, bytes.Length, IOUtils.CHARSET_UTF_8);
    Assert.IsTrue(binaryFldStoredTest.Equals(BinaryValStored));

    // fetch the string field and compare its content with the original one
    string stringFldStoredTest = docFromReader.Get("stringStored");
    Assert.IsTrue(stringFldStoredTest.Equals(BinaryValStored));

    writer.Dispose();
    reader.Dispose();
    dir.Dispose();
}
public virtual void TestAddIndexes()
{
    Directory dir1 = NewDirectory();
    Directory dir2 = NewDirectory();
    IndexWriter writer = new IndexWriter(dir1, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NoMergePolicy.COMPOUND_FILES));

    Document d1 = new Document();
    d1.Add(new TextField("f1", "first field", Field.Store.YES));
    d1.Add(new TextField("f2", "second field", Field.Store.YES));
    writer.AddDocument(d1);
    writer.Dispose();

    writer = new IndexWriter(dir2, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NoMergePolicy.COMPOUND_FILES));
    Document d2 = new Document();
    FieldType customType2 = new FieldType(TextField.TYPE_STORED);
    customType2.StoreTermVectors = true;
    d2.Add(new TextField("f2", "second field", Field.Store.YES));
    d2.Add(new Field("f1", "first field", customType2));
    d2.Add(new TextField("f3", "third field", Field.Store.YES));
    d2.Add(new TextField("f4", "fourth field", Field.Store.YES));
    writer.AddDocument(d2);
    writer.Dispose();

    writer = new IndexWriter(dir1, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NoMergePolicy.COMPOUND_FILES));
    writer.AddIndexes(dir2);
    writer.Dispose();

    SegmentInfos sis = new SegmentInfos();
    sis.Read(dir1);
    Assert.AreEqual(2, sis.Size());

    FieldInfos fis1 = SegmentReader.ReadFieldInfos(sis.Info(0));
    FieldInfos fis2 = SegmentReader.ReadFieldInfos(sis.Info(1));

    Assert.AreEqual("f1", fis1.FieldInfo(0).Name);
    Assert.AreEqual("f2", fis1.FieldInfo(1).Name);
    // make sure the ordering of the "external" segment is preserved
    Assert.AreEqual("f2", fis2.FieldInfo(0).Name);
    Assert.AreEqual("f1", fis2.FieldInfo(1).Name);
    Assert.AreEqual("f3", fis2.FieldInfo(2).Name);
    Assert.AreEqual("f4", fis2.FieldInfo(3).Name);

    dir1.Dispose();
    dir2.Dispose();
}
public virtual void TestBinaryField()
{
    Documents.Document doc = new Documents.Document();

    FieldType ft = new FieldType();
    ft.Stored = true;
    IndexableField stringFld = new Field("string", BinaryVal, ft);
    IndexableField binaryFld = new StoredField("binary", BinaryVal.GetBytes(Encoding.UTF8));
    IndexableField binaryFld2 = new StoredField("binary", BinaryVal2.GetBytes(Encoding.UTF8));

    doc.Add(stringFld);
    doc.Add(binaryFld);

    Assert.AreEqual(2, doc.Fields.Count);

    Assert.IsTrue(binaryFld.BinaryValue != null);
    Assert.IsTrue(binaryFld.FieldType.Stored);
    Assert.IsFalse(binaryFld.FieldType.Indexed);

    string binaryTest = doc.GetBinaryValue("binary").Utf8ToString();
    Assert.IsTrue(binaryTest.Equals(BinaryVal));

    string stringTest = doc.Get("string");
    Assert.IsTrue(binaryTest.Equals(stringTest));

    doc.Add(binaryFld2);

    Assert.AreEqual(3, doc.Fields.Count);

    BytesRef[] binaryTests = doc.GetBinaryValues("binary");
    Assert.AreEqual(2, binaryTests.Length);

    binaryTest = binaryTests[0].Utf8ToString();
    string binaryTest2 = binaryTests[1].Utf8ToString();

    Assert.IsFalse(binaryTest.Equals(binaryTest2));
    Assert.IsTrue(binaryTest.Equals(BinaryVal));
    Assert.IsTrue(binaryTest2.Equals(BinaryVal2));

    doc.RemoveField("string");
    Assert.AreEqual(2, doc.Fields.Count);

    doc.RemoveFields("binary");
    Assert.AreEqual(0, doc.Fields.Count);
}
public virtual void Test()
{
    IndexWriter w = new IndexWriter(Dir, NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
    try
    {
        FieldType ft = new FieldType();
        ft.Indexed = true;
        ft.Stored = Random().NextBoolean();
        ft.Freeze();

        Document doc = new Document();
        if (Random().NextBoolean())
        {
            // totally ok short field value
            doc.Add(new Field(TestUtil.RandomSimpleString(Random(), 1, 10), TestUtil.RandomSimpleString(Random(), 1, 10), ft));
        }
        // problematic field
        string name = TestUtil.RandomSimpleString(Random(), 1, 50);
        string value = TestUtil.RandomSimpleString(Random(), MinTestTermLength, MaxTestTermLegnth);
        Field f = new Field(name, value, ft);
        if (Random().NextBoolean())
        {
            // totally ok short field value
            doc.Add(new Field(TestUtil.RandomSimpleString(Random(), 1, 10), TestUtil.RandomSimpleString(Random(), 1, 10), ft));
        }
        doc.Add(f);

        try
        {
            w.AddDocument(doc);
            Assert.Fail("Did not get an exception from adding a monster term");
        }
        catch (System.ArgumentException e)
        {
            string maxLengthMsg = Convert.ToString(IndexWriter.MAX_TERM_LENGTH);
            string msg = e.Message;
            Assert.IsTrue(msg.Contains("immense term"), "IllegalArgumentException didn't mention 'immense term': " + msg);
            Assert.IsTrue(msg.Contains(maxLengthMsg), "IllegalArgumentException didn't mention max length (" + maxLengthMsg + "): " + msg);
            Assert.IsTrue(msg.Contains(name), "IllegalArgumentException didn't mention field name (" + name + "): " + msg);
        }
    }
    finally
    {
        w.Dispose();
    }
}
public virtual void TestPositionsSimple()
{
    Directory directory = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
    for (int i = 0; i < 39; i++)
    {
        Document doc = new Document();
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.OmitNorms = true;
        doc.Add(NewField(FieldName, "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10", customType));
        writer.AddDocument(doc);
    }
    IndexReader reader = writer.Reader;
    writer.Dispose();

    int num = AtLeast(13);
    for (int i = 0; i < num; i++)
    {
        BytesRef bytes = new BytesRef("1");
        IndexReaderContext topReaderContext = reader.Context;
        foreach (AtomicReaderContext atomicReaderContext in topReaderContext.Leaves)
        {
            DocsAndPositionsEnum docsAndPosEnum = GetDocsAndPositions((AtomicReader)atomicReaderContext.Reader, bytes, null);
            Assert.IsNotNull(docsAndPosEnum);
            if (atomicReaderContext.Reader.MaxDoc == 0)
            {
                continue;
            }
            int advance = docsAndPosEnum.Advance(Random().Next(atomicReaderContext.Reader.MaxDoc));
            do
            {
                string msg = "Advanced to: " + advance + " current doc: " + docsAndPosEnum.DocID(); // TODO: + " usePayloads: " + usePayload;
                Assert.AreEqual(4, docsAndPosEnum.Freq(), msg);
                Assert.AreEqual(0, docsAndPosEnum.NextPosition(), msg);
                Assert.AreEqual(4, docsAndPosEnum.Freq(), msg);
                Assert.AreEqual(10, docsAndPosEnum.NextPosition(), msg);
                Assert.AreEqual(4, docsAndPosEnum.Freq(), msg);
                Assert.AreEqual(20, docsAndPosEnum.NextPosition(), msg);
                Assert.AreEqual(4, docsAndPosEnum.Freq(), msg);
                Assert.AreEqual(30, docsAndPosEnum.NextPosition(), msg);
            } while (docsAndPosEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        }
    }
    reader.Dispose();
    directory.Dispose();
}
private Document NewDocument()
{
    Document doc = new Document();
    foreach (FieldInfo.IndexOptions option in Enum.GetValues(typeof(FieldInfo.IndexOptions)))
    {
        var ft = new FieldType(TextField.TYPE_NOT_STORED)
        {
            StoreTermVectors = true,
            StoreTermVectorOffsets = true,
            StoreTermVectorPositions = true,
            StoreTermVectorPayloads = true,
            IndexOptions = option
        };
        // turn on tvs for a cross-check, since we rely upon checkindex in this test (for now)
        doc.Add(new Field(option.ToString(), "", ft));
    }
    return doc;
}
public virtual void TestMixupDocs()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorPositions = true;
    customType.StoreTermVectorPayloads = true;
    customType.StoreTermVectorOffsets = Random().NextBoolean();
    Field field = new Field("field", "", customType);
    TokenStream ts = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.TokenStream = ts;
    doc.Add(field);
    writer.AddDocument(doc);

    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.Payload = new BytesRef("test");
    ts = new CannedTokenStream(withPayload);
    Assert.IsTrue(ts.HasAttribute<IPayloadAttribute>());
    field.TokenStream = ts;
    writer.AddDocument(doc);

    ts = new MockTokenizer(new StringReader("another"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.TokenStream = ts;
    writer.AddDocument(doc);

    DirectoryReader reader = writer.Reader;
    Terms terms = reader.GetTermVector(1, "field");
    Debug.Assert(terms != null);
    TermsEnum termsEnum = terms.Iterator(null);
    Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
    DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(0, de.NextPosition());
    Assert.AreEqual(new BytesRef("test"), de.Payload);
    writer.Dispose();
    reader.Dispose();
    dir.Dispose();
}
public virtual void TestSimpleCase()
{
    string[] keywords = new string[] { "1", "2" };
    string[] unindexed = new string[] { "Netherlands", "Italy" };
    string[] unstored = new string[] { "Amsterdam has lots of bridges", "Venice has lots of canals" };
    string[] text = new string[] { "Amsterdam", "Venice" };

    Directory dir = NewDirectory();
    IndexWriter modifier = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)).SetMaxBufferedDeleteTerms(1));

    FieldType custom1 = new FieldType();
    custom1.Stored = true;
    for (int i = 0; i < keywords.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewStringField("id", keywords[i], Field.Store.YES));
        doc.Add(NewField("country", unindexed[i], custom1));
        doc.Add(NewTextField("contents", unstored[i], Field.Store.NO));
        doc.Add(NewTextField("city", text[i], Field.Store.YES));
        modifier.AddDocument(doc);
    }
    modifier.ForceMerge(1);
    modifier.Commit();

    Term term = new Term("city", "Amsterdam");
    int hitCount = GetHitCount(dir, term);
    Assert.AreEqual(1, hitCount);
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: now delete by term=" + term);
    }
    modifier.DeleteDocuments(term);
    modifier.Commit();

    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: now getHitCount");
    }
    hitCount = GetHitCount(dir, term);
    Assert.AreEqual(0, hitCount);

    modifier.Dispose();
    dir.Dispose();
}
public virtual void Test([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")]IConcurrentMergeScheduler scheduler)
{
    BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BPostings"));
    if (dir is MockDirectoryWrapper)
    {
        ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
    }

    var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(scheduler)
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);

    IndexWriter w = new IndexWriter(dir, config);

    MergePolicy mp = w.Config.MergePolicy;
    if (mp is LogByteSizeMergePolicy)
    {
        // 1 petabyte:
        ((LogByteSizeMergePolicy)mp).MaxMergeMB = 1024 * 1024 * 1024;
    }

    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.OmitNorms = true;
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_ONLY;
    Field field = new Field("field", new MyTokenStream(), ft);
    doc.Add(field);

    int numDocs = (int.MaxValue / 26) + 1;
    for (int i = 0; i < numDocs; i++)
    {
        w.AddDocument(doc);
        if (VERBOSE && i % 100000 == 0)
        {
            Console.WriteLine(i + " of " + numDocs + "...");
        }
    }
    w.ForceMerge(1);
    w.Dispose();
    dir.Dispose();
}
public virtual void TestDeletePartiallyWrittenFilesIfAbort()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwConf = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwConf.SetMaxBufferedDocs(RandomInts.NextIntBetween(Random(), 2, 30));
    iwConf.SetCodec(CompressingCodec.RandomInstance(Random()));
    // disable CFS because this test checks file names
    iwConf.SetMergePolicy(NewLogMergePolicy(false));
    iwConf.SetUseCompoundFile(false);
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwConf);

    Document validDoc = new Document();
    validDoc.Add(new IntField("id", 0, Field.Store.YES));
    iw.AddDocument(validDoc);
    iw.Commit();

    // make sure that #writeField will fail to trigger an abort
    Document invalidDoc = new Document();
    FieldType fieldType = new FieldType();
    fieldType.Stored = true;
    invalidDoc.Add(new FieldAnonymousInnerClassHelper(this, fieldType));

    try
    {
        iw.AddDocument(invalidDoc);
        iw.Commit();
    }
    finally
    {
        int counter = 0;
        foreach (string fileName in dir.ListAll())
        {
            if (fileName.EndsWith(".fdt") || fileName.EndsWith(".fdx"))
            {
                counter++;
            }
        }
        // only one .fdt and one .fdx file must have been found
        Assert.AreEqual(2, counter);
        iw.Dispose();
        dir.Dispose();
    }
}
public override void SetUp()
{
    base.SetUp();
    Directory = NewDirectory();
    IndexWriter writer = new IndexWriter(Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
    //writer.setNoCFSRatio(0.0);
    //writer.infoStream = System.out;

    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.Tokenized = false;
    customType.StoreTermVectors = true;
    for (int i = 0; i < NumDocs; i++)
    {
        Documents.Document doc = new Documents.Document();
        Field fld = NewField("field", English.IntToEnglish(i), customType);
        doc.Add(fld);
        writer.AddDocument(doc);
    }
    writer.Dispose();
}
public void BeforeClass()
{
    Directory = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetMergePolicy(NewLogMergePolicy()));
    //writer.setNoCFSRatio(1.0);
    //writer.infoStream = System.out;

    for (int i = 0; i < 1000; i++)
    {
        Document doc = new Document();
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        int mod3 = i % 3;
        int mod2 = i % 2;
        if (mod2 == 0 && mod3 == 0)
        {
            ft.StoreTermVectors = true;
            ft.StoreTermVectorOffsets = true;
            ft.StoreTermVectorPositions = true;
        }
        else if (mod2 == 0)
        {
            ft.StoreTermVectors = true;
            ft.StoreTermVectorPositions = true;
        }
        else if (mod3 == 0)
        {
            ft.StoreTermVectors = true;
            ft.StoreTermVectorOffsets = true;
        }
        else
        {
            ft.StoreTermVectors = true;
        }
        doc.Add(new Field("field", English.IntToEnglish(i), ft));
        // test no term vectors too
        doc.Add(new TextField("noTV", English.IntToEnglish(i), Field.Store.YES));
        writer.AddDocument(doc);
    }
    Reader = writer.Reader;
    writer.Dispose();
}
public virtual void TestChangeGaps()
{
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    int positionGap = Random().Next(1000);
    int offsetGap = Random().Next(1000);
    Analyzer @delegate = new MockAnalyzer(Random());
    Analyzer a = new AnalyzerWrapperAnonymousInnerClassHelper2(this, @delegate.Strategy, positionGap, offsetGap, @delegate);

    RandomIndexWriter writer = new RandomIndexWriter(Random(), NewDirectory());
    Document doc = new Document();
    FieldType ft = new FieldType();
    ft.Indexed = true;
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_ONLY;
    ft.Tokenized = true;
    ft.StoreTermVectors = true;
    ft.StoreTermVectorPositions = true;
    ft.StoreTermVectorOffsets = true;
    doc.Add(new Field("f", "a", ft));
    doc.Add(new Field("f", "a", ft));
    writer.AddDocument(doc, a);

    AtomicReader reader = GetOnlySegmentReader(writer.Reader);
    Fields fields = reader.GetTermVectors(0);
    Terms terms = fields.Terms("f");
    TermsEnum te = terms.Iterator(null);
    Assert.AreEqual(new BytesRef("a"), te.Next());
    DocsAndPositionsEnum dpe = te.DocsAndPositions(null, null);
    Assert.AreEqual(0, dpe.NextDoc());
    Assert.AreEqual(2, dpe.Freq());
    Assert.AreEqual(0, dpe.NextPosition());
    Assert.AreEqual(0, dpe.StartOffset());
    int endOffset = dpe.EndOffset();
    Assert.AreEqual(1 + positionGap, dpe.NextPosition());
    Assert.AreEqual(1 + endOffset + offsetGap, dpe.EndOffset());
    Assert.AreEqual(null, te.Next());
    reader.Dispose();
    writer.Dispose();
    writer.w.Directory.Dispose();
}
public virtual void TestBinary()
{
    Directory dir = NewDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir);
    BytesRef bytes = new BytesRef(2);
    BinaryTokenStream tokenStream = new BinaryTokenStream(bytes);

    for (int i = 0; i < 256; i++)
    {
        bytes.Bytes[0] = (byte)i;
        bytes.Bytes[1] = unchecked((byte)(255 - i));
        bytes.Length = 2;
        Document doc = new Document();
        FieldType customType = new FieldType();
        customType.Stored = true;
        doc.Add(new Field("id", "" + i, customType));
        doc.Add(new TextField("bytes", tokenStream));
        iw.AddDocument(doc);
    }

    IndexReader ir = iw.Reader;
    iw.Dispose();

    IndexSearcher @is = NewSearcher(ir);

    for (int i = 0; i < 256; i++)
    {
        bytes.Bytes[0] = (byte)i;
        bytes.Bytes[1] = unchecked((byte)(255 - i));
        bytes.Length = 2;
        TopDocs docs = @is.Search(new TermQuery(new Term("bytes", bytes)), 5);
        Assert.AreEqual(1, docs.TotalHits);
        Assert.AreEqual("" + i, @is.Doc(docs.ScoreDocs[0].Doc).Get("id"));
    }

    ir.Dispose();
    dir.Dispose();
}
public virtual void TestNoOrds()
{
    Directory dir = NewDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.StoreTermVectors = true;
    doc.Add(new Field("foo", "this is a test", ft));
    iw.AddDocument(doc);
    AtomicReader ir = GetOnlySegmentReader(iw.Reader);
    Terms terms = ir.GetTermVector(0, "foo");
    Assert.IsNotNull(terms);
    TermsEnum termsEnum = terms.Iterator(null);
    Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef("this")));
    try
    {
        termsEnum.Ord();
        Assert.Fail();
    }
    catch (System.NotSupportedException)
    {
        // expected exception
    }
    try
    {
        termsEnum.SeekExact(0);
        Assert.Fail();
    }
    catch (System.NotSupportedException)
    {
        // expected exception
    }
    ir.Dispose();
    iw.Dispose();
    dir.Dispose();
}
public void BeforeClass()
{
    string[] data = new string[]
    {
        "A 1 2 3 4 5 6",
        "Z 4 5 6",
        null,
        "B 2 4 5 6",
        "Y 3 5 6",
        null,
        "C 3 6",
        "X 4 5 6"
    };

    Small = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random(), Small, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)).SetMergePolicy(NewLogMergePolicy()));

    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.Tokenized = false;
    for (int i = 0; i < data.Length; i++)
    {
        Document doc = new Document();
        doc.Add(NewField("id", Convert.ToString(i), customType)); // Field.Keyword("id", String.valueOf(i)));
        doc.Add(NewField("all", "all", customType)); // Field.Keyword("all", "all"));
        if (null != data[i])
        {
            doc.Add(NewTextField("data", data[i], Field.Store.YES)); // Field.Text("data", data[i]));
        }
        writer.AddDocument(doc);
    }

    Reader = writer.Reader;
    writer.Dispose();
}
/// <summary>
/// Expert: allows you to customize the <see cref="FieldType"/>.
/// </summary>
/// <param name="name"> field name </param>
/// <param name="value"> 64-bit long value </param>
/// <param name="type"> customized field type: must have a <see cref="FieldType.NumericTypeValue"/>
/// of <see cref="FieldType.NumericType.LONG"/>. </param>
/// <exception cref="System.ArgumentException"> if the field name or type is null, or
/// if the field type does not have a LONG <see cref="FieldType.NumericTypeValue"/> </exception>
public LongField(string name, long value, FieldType type)
    : base(name, type)
{
    if (type.NumericTypeValue != Documents.FieldType.NumericType.LONG)
    {
        throw new System.ArgumentException("type.numericType() must be LONG but got " + type.NumericTypeValue);
    }
    FieldsData = Convert.ToInt64(value);
}
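// A minimal usage sketch (not from the original source): customize the field
// type so the long value is also stored for retrieval. LongField.TYPE_NOT_STORED
// is assumed to be the default template, as in the Java original.
FieldType t = new FieldType(LongField.TYPE_NOT_STORED);
t.Stored = true; // also store the raw value
t.Freeze();
Document doc = new Document();
doc.Add(new LongField("timestamp", 1394104654000L, t)); // NumericTypeValue is LONG, so no exception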
public static Field NewField(Random random, string name, string value, FieldType type)
{
    name = new string(name.ToCharArray());
    if (Usually(random) || !type.Indexed)
    {
        // most of the time, don't modify the params
        return new Field(name, value, type);
    }

    // TODO: once all core & test codecs can index
    // offsets, sometimes randomly turn on offsets if we are
    // already indexing positions...

    FieldType newType = new FieldType(type);
    if (!newType.Stored && random.NextBoolean())
    {
        newType.Stored = true; // randomly store it
    }

    if (!newType.StoreTermVectors && random.NextBoolean())
    {
        newType.StoreTermVectors = true;
        if (!newType.StoreTermVectorOffsets)
        {
            newType.StoreTermVectorOffsets = random.NextBoolean();
        }
        if (!newType.StoreTermVectorPositions)
        {
            newType.StoreTermVectorPositions = random.NextBoolean();
            if (newType.StoreTermVectorPositions && !newType.StoreTermVectorPayloads && !OLD_FORMAT_IMPERSONATION_IS_ACTIVE)
            {
                newType.StoreTermVectorPayloads = random.NextBoolean();
            }
        }
    }

    // TODO: we need to do this, but smarter, ie, most of
    // the time we set the same value for a given field but
    // sometimes (rarely) we change it up:
    /*
    if (newType.OmitsNorms())
    {
        newType.setOmitNorms(random.NextBoolean());
    }
    */

    return new Field(name, value, newType);
}
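// Illustrative call (assumed test context, not from the original source): the
// helper keeps the logical field contents identical while randomly layering on
// storage/term-vector options, so assertions that depend only on indexed data
// still hold across randomized runs.
Document doc = new Document();
FieldType baseType = new FieldType(TextField.TYPE_NOT_STORED);
doc.Add(NewField(Random(), "body", "some test content", baseType));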
/// <summary> /// Add documents. /// </summary> /// <param name="writer">The index writer.</param> /// <param name="facetWriter">The facet index writer.</param> /// <param name="directoryInfo">The directory information where all the files that are to be added are located.</param> /// <param name="files">The list of files that are to be added.</param> /// <param name="documents">The supported documents search filter, used to indicate what files are to be added.</param> /// <param name="facetField">The facet field information.</param> /// <param name="config">The facet configuration information.</param> public void AddDocuments(Lucene.Net.Index.IndexWriter writer, DirectoryTaxonomyWriter facetWriter, DirectoryInfo directoryInfo, string[] files, SupportedDocumentExtension documents, FacetField facetField, FacetsConfig config) { FieldType pathFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = true, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; FieldType contentFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = documents.TokenizeContent, Stored = documents.StoreContent, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // For each file. for (int i = 0; i < files.Length; i++) { // If the file exists if (File.Exists(files[i])) { Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document(); try { FileInfo fileInfo = new FileInfo(files[i]); string file = files[i].Replace(directoryInfo.Root.FullName, "").ToLower(); Lucene.Net.Documents.Field path = new Field("path", file.ToLower().Replace("\\", "/"), pathFieldType); Lucene.Net.Documents.Field modified = new Field("modified", fileInfo.LastWriteTime.ToShortDateString() + " " + fileInfo.LastWriteTime.ToShortTimeString(), pathFieldType); // Add the fields. document.Add(facetField); document.Add(path); document.Add(modified); // Create the stream reader. OpenDocument(files[i]); string content = Nequeo.Xml.Document.ExtractContent(_xDocument); // If content exists. if (!String.IsNullOrEmpty(content)) { // Split the white spaces from the text. string[] words = content.Words(); // If words exist. if (words != null && words.Length > 0) { // Add the query for each word. for (int j = 0; j < words.Length; j++) { // Format the word. string word = words[j].ToLower().RemovePunctuationFromStartAndEnd(); // If a word exists. if (!String.IsNullOrEmpty(word)) { Lucene.Net.Documents.Field contentField = new Field("facetcontent", word, contentFieldType); document.Add(contentField); } } } } // Add the document. writer.AddDocument(config.Build(facetWriter, document)); _document.Close(); // Commit after a set number of documents. documents.TotalDocumentSize += fileInfo.Length; if (documents.TotalDocumentSize > documents.MaxDocumentSizePerCommit) { // Commit the index. writer.Commit(); facetWriter.Commit(); documents.TotalDocumentSize = 0; } } catch (Exception) { throw; } finally { CloseDocument(); } } } }
/// <summary> /// Add text to the existing index. /// </summary> /// <param name="writer">The index writer.</param> /// <param name="facetWriter">The facet index writer.</param> /// <param name="addTextData">The text data to add.</param> /// <param name="config">The facet configuration information.</param> public void AddText(Lucene.Net.Index.IndexWriter writer, DirectoryTaxonomyWriter facetWriter, Dictionary <FacetField, AddTextData[]> addTextData, FacetsConfig config) { long totalTextLength = 0; long maxTextLengthBeforeCommit = 30000000L; // For each text facet. foreach (KeyValuePair <FacetField, AddTextData[]> item in addTextData) { // If text exists. if (item.Value != null && item.Value.Length > 0) { // Add the text. FieldType nameFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = true, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // Add the text. FieldType completeFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = true, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // Add the text. FieldType textFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = false, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // For each text. foreach (AddTextData data in item.Value) { // Should the data be stored. completeFieldType.Stored = data.StoreText; // Create the document. Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document(); Lucene.Net.Documents.Field textName = new Field("textname", data.Name.ToLower(), nameFieldType); Lucene.Net.Documents.Field textComplete = new Field("textcomplete", data.Text.ToLower(), completeFieldType); document.Add(item.Key); document.Add(textName); document.Add(textComplete); // Split the white spaces from the text. string[] words = data.Text.Words(); // If words exist. if (words != null && words.Length > 0) { // Add the query for each word. for (int j = 0; j < words.Length; j++) { // Format the word. string word = words[j].ToLower().RemovePunctuationFromStartAndEnd(); // If a word exists. if (!String.IsNullOrEmpty(word)) { Lucene.Net.Documents.Field textData = new Field("facetcontent", word, textFieldType); document.Add(textData); } } } // Add the document. writer.AddDocument(config.Build(facetWriter, document)); // Commit after a set number of documents. totalTextLength += (long)data.Text.Length; if (totalTextLength > maxTextLengthBeforeCommit) { // Commit the index. writer.Commit(); facetWriter.Commit(); totalTextLength = 0; } } } } // Commit the index. writer.Commit(); facetWriter.Commit(); }
/// <summary> /// Add text to the existing index. /// </summary> /// <param name="directoryIndexInfo">The directory infomation where the index files are located.</param> /// <param name="addTextData">The text data to add.</param> public void AddText(DirectoryInfo directoryIndexInfo, AddTextData[] addTextData) { Lucene.Net.Index.IndexWriter writer = null; Lucene.Net.Store.Directory directory = null; long totalTextLength = 0; long maxTextLengthBeforeCommit = 30000000L; try { // If text exists. if (addTextData != null && addTextData.Length > 0) { // Create the analyzer. SimpleAnalyzer simpleAnalyzer = new Analyzer.SimpleAnalyzer(); StandardAnalyzer standardAnalyzer = new Analyzer.StandardAnalyzer(simpleAnalyzer); // Create the index writer. directory = FSDirectory.Open(directoryIndexInfo); IndexWriterConfig indexConfig = new IndexWriterConfig(Lucene.Net.Util.LuceneVersion.LUCENE_48, standardAnalyzer); indexConfig.SetOpenMode(IndexWriterConfig.OpenMode_e.APPEND); // Open existing or create new. writer = new IndexWriter(directory, indexConfig); // Add the text. FieldType nameFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = true, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // Add the text. FieldType completeFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = true, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // Add the text. FieldType textFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = false, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // For each text. foreach (AddTextData data in addTextData) { // Should the data be stored. completeFieldType.Stored = data.StoreText; // Create the document. Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document(); Lucene.Net.Documents.Field textName = new Field("textname", data.Name.ToLower(), nameFieldType); Lucene.Net.Documents.Field textComplete = new Field("textcomplete", data.Text.ToLower(), completeFieldType); document.Add(textName); document.Add(textComplete); // Split the white spaces from the text. string[] words = data.Text.Words(); // If words exist. if (words != null && words.Length > 0) { // Add the query for each word. for (int j = 0; j < words.Length; j++) { // Format the word. string word = words[j].ToLower().RemovePunctuationFromStartAndEnd(); // If a word exists. if (!String.IsNullOrEmpty(word)) { Lucene.Net.Documents.Field textData = new Field("text", word, textFieldType); document.Add(textData); } } } // Add the document. writer.AddDocument(document.Fields); // Commit after a set number of documents. totalTextLength += (long)data.Text.Length; if (totalTextLength > maxTextLengthBeforeCommit) { // Commit the index. writer.Commit(); totalTextLength = 0; } } // Commit the index. writer.Commit(); } } catch (Exception) { throw; } finally { if (writer != null) { writer.Dispose(); } if (directory != null) { directory.Dispose(); } } }
/// <summary> /// Add documents. /// </summary> /// <param name="writer">The index writer.</param> /// <param name="directoryInfo">The directory information where all the files that are to be added are located.</param> /// <param name="files">The list of files that are to be added.</param> /// <param name="documents">The supported documents search filter, used to indicate what files are to be added.</param> public void AddDocuments(Lucene.Net.Index.IndexWriter writer, DirectoryInfo directoryInfo, string[] files, SupportedDocumentExtension documents) { System.Windows.Forms.RichTextBox textbox = new System.Windows.Forms.RichTextBox(); FieldType pathFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = false, Stored = true, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; FieldType contentFieldType = new Lucene.Net.Documents.FieldType() { Indexed = true, Tokenized = documents.TokenizeContent, Stored = documents.StoreContent, IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, }; // For each file. for (int i = 0; i < files.Length; i++) { // If the file exists if (File.Exists(files[i])) { Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document(); System.IO.StreamReader stream = null; try { FileInfo fileInfo = new FileInfo(files[i]); string file = files[i].Replace(directoryInfo.Root.FullName, "").ToLower(); Lucene.Net.Documents.Field path = new Field("path", file.ToLower().Replace("\\", "/"), pathFieldType); Lucene.Net.Documents.Field modified = new Field("modified", fileInfo.LastWriteTime.ToShortDateString() + " " + fileInfo.LastWriteTime.ToShortTimeString(), pathFieldType); // Add the fields. document.Add(path); document.Add(modified); // Create the stream reader. stream = new StreamReader(files[i]); string contentRtf = stream.ReadToEnd(); textbox.Rtf = contentRtf; string content = textbox.Text; // If content exists. if (!String.IsNullOrEmpty(content)) { // Split the white spaces from the text. string[] words = content.Words(); // If words exist. if (words != null && words.Length > 0) { // Add the query for each word. for (int j = 0; j < words.Length; j++) { // Format the word. string word = words[j].ToLower().RemovePunctuationFromStartAndEnd(); // If a word exists. if (!String.IsNullOrEmpty(word)) { Lucene.Net.Documents.Field contentField = new Field("content", word, contentFieldType); document.Add(contentField); } } } } // Add the document. writer.AddDocument(document.Fields); stream.Close(); // Commit after a set number of documents. documents.TotalDocumentSize += fileInfo.Length; if (documents.TotalDocumentSize > documents.MaxDocumentSizePerCommit) { writer.Commit(); documents.TotalDocumentSize = 0; } } catch (Exception) { throw; } finally { if (stream != null) { stream.Dispose(); } } } } }