/// <summary>
/// Writes one stored field: the field number as a vint, a "bits" byte that
/// encodes the value kind (numeric int/long/float/double, binary, or string),
/// and then the value payload in the matching encoding.
/// </summary>
/// <param name="info">Field metadata; only <c>Number</c> is written.</param>
/// <param name="field">The field whose stored value is serialized.</param>
/// <exception cref="ArgumentException">
/// If the numeric type is unsupported, or the field has neither a binary,
/// string, nor numeric value.
/// </exception>
public override void WriteField(FieldInfo info, IIndexableField field)
{
    fieldsStream.WriteVInt32(info.Number);
    int bits = 0;
    BytesRef bytes;
    string @string;
    // TODO: maybe a field should serialize itself? this way we don't bake
    // into indexer all these specific encodings for different fields? and
    // apps can customize...

    // LUCENENET specific - To avoid boxing/unboxing, we don't
    // call GetNumericValue(). Instead, we check the field.NumericType and then
    // call the appropriate conversion method.
    if (field.NumericType != NumericFieldType.NONE)
    {
        switch (field.NumericType)
        {
            case NumericFieldType.BYTE:
            case NumericFieldType.INT16:
            case NumericFieldType.INT32:
                bits |= FIELD_IS_NUMERIC_INT;
                break;

            case NumericFieldType.INT64:
                bits |= FIELD_IS_NUMERIC_LONG;
                break;

            case NumericFieldType.SINGLE:
                bits |= FIELD_IS_NUMERIC_FLOAT;
                break;

            case NumericFieldType.DOUBLE:
                bits |= FIELD_IS_NUMERIC_DOUBLE;
                break;

            default:
                throw new ArgumentException("cannot store numeric type " + field.NumericType);
        }
        @string = null;
        bytes = null;
    }
    else
    {
        bytes = field.GetBinaryValue();
        if (bytes != null)
        {
            bits |= FIELD_IS_BINARY;
            @string = null;
        }
        else
        {
            @string = field.GetStringValue();
            if (@string == null)
            {
                throw new ArgumentException("field " + field.Name + " is stored but does not have binaryValue, stringValue nor numericValue");
            }
        }
    }

    fieldsStream.WriteByte((byte)(sbyte)bits);

    if (bytes != null)
    {
        fieldsStream.WriteVInt32(bytes.Length);
        fieldsStream.WriteBytes(bytes.Bytes, bytes.Offset, bytes.Length);
    }
    else if (@string != null)
    {
        // Fix: write the value captured and null-checked above instead of
        // calling GetStringValue() a second time. This avoids a redundant
        // virtual call and guarantees the bytes written agree with the
        // null check just performed.
        fieldsStream.WriteString(@string);
    }
    else
    {
        // Numeric payload: the raw bits of the value, width chosen by type.
        switch (field.NumericType)
        {
            case NumericFieldType.BYTE:
            case NumericFieldType.INT16:
            case NumericFieldType.INT32:
                fieldsStream.WriteInt32(field.GetInt32Value().Value);
                break;

            case NumericFieldType.INT64:
                fieldsStream.WriteInt64(field.GetInt64Value().Value);
                break;

            case NumericFieldType.SINGLE:
                fieldsStream.WriteInt32(BitConversion.SingleToInt32Bits(field.GetSingleValue().Value));
                break;

            case NumericFieldType.DOUBLE:
                fieldsStream.WriteInt64(BitConversion.DoubleToInt64Bits(field.GetDoubleValue().Value));
                break;

            default:
                // The first switch already rejected unknown types.
                throw new InvalidOperationException("Cannot get here");
        }
    }
}
/// <summary>
/// Sanity-checks a back-compat index created by an older Lucene version:
/// runs CheckIndex, verifies the stored fields and term vectors of the 35
/// original documents (doc 7 is deleted), verifies doc values (4.0+) and
/// sorted-set doc values (4.2+), then runs several term queries over
/// "content" and the UTF-8 torture fields and checks the hit counts.
/// </summary>
/// <param name="oldName">Name of the legacy index, used only in assertion messages.</param>
public virtual void SearchIndex(Directory dir, string oldName)
{
    //QueryParser parser = new QueryParser("contents", new MockAnalyzer(random));
    //Query query = parser.parse("handle:1");

    IndexReader reader = DirectoryReader.Open(dir);
    IndexSearcher searcher = NewSearcher(reader);

    TestUtil.CheckIndex(dir);

    // Version detection is by probing for fields only written by newer
    // index creators.
    // true if this is a 4.0+ index
    bool is40Index = MultiFields.GetMergedFieldInfos(reader).FieldInfo("content5") != null;
    // true if this is a 4.2+ index
    bool is42Index = MultiFields.GetMergedFieldInfos(reader).FieldInfo("dvSortedSet") != null;

    Debug.Assert(is40Index); // NOTE: currently we can only do this on trunk!

    IBits liveDocs = MultiFields.GetLiveDocs(reader);

    for (int i = 0; i < 35; i++)
    {
        if (liveDocs.Get(i))
        {
            Document d = reader.Document(i);
            IList<IIndexableField> fields = d.Fields;
            // Docs with "content3" carry a different field set; skip the
            // per-field stored-value checks for those.
            bool isProxDoc = d.GetField("content3") == null;
            if (isProxDoc)
            {
                int numFields = is40Index ? 7 : 5;
                Assert.AreEqual(numFields, fields.Count);

                IIndexableField f = d.GetField("id");
                Assert.AreEqual("" + i, f.GetStringValue());

                // Surrogate pairs, NUL, and other non-BMP chars must round-trip.
                f = d.GetField("utf8");
                Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.GetStringValue());

                f = d.GetField("autf8");
                Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.GetStringValue());

                f = d.GetField("content2");
                Assert.AreEqual("here is more content with aaa aaa aaa", f.GetStringValue());

                f = d.GetField("fie\u2C77ld");
                Assert.AreEqual("field with non-ascii name", f.GetStringValue());
            }

            Fields tfvFields = reader.GetTermVectors(i);
            Assert.IsNotNull(tfvFields, "i=" + i);
            Terms tfv = tfvFields.GetTerms("utf8");
            Assert.IsNotNull(tfv, "docID=" + i + " index=" + oldName);
        }
        else
        {
            // Only ID 7 is deleted
            Assert.AreEqual(7, i);
        }
    }

    if (is40Index)
    {
        // check docvalues fields
        NumericDocValues dvByte = MultiDocValues.GetNumericValues(reader, "dvByte");
        BinaryDocValues dvBytesDerefFixed = MultiDocValues.GetBinaryValues(reader, "dvBytesDerefFixed");
        BinaryDocValues dvBytesDerefVar = MultiDocValues.GetBinaryValues(reader, "dvBytesDerefVar");
        SortedDocValues dvBytesSortedFixed = MultiDocValues.GetSortedValues(reader, "dvBytesSortedFixed");
        SortedDocValues dvBytesSortedVar = MultiDocValues.GetSortedValues(reader, "dvBytesSortedVar");
        BinaryDocValues dvBytesStraightFixed = MultiDocValues.GetBinaryValues(reader, "dvBytesStraightFixed");
        BinaryDocValues dvBytesStraightVar = MultiDocValues.GetBinaryValues(reader, "dvBytesStraightVar");
        NumericDocValues dvDouble = MultiDocValues.GetNumericValues(reader, "dvDouble");
        NumericDocValues dvFloat = MultiDocValues.GetNumericValues(reader, "dvFloat");
        NumericDocValues dvInt = MultiDocValues.GetNumericValues(reader, "dvInt");
        NumericDocValues dvLong = MultiDocValues.GetNumericValues(reader, "dvLong");
        NumericDocValues dvPacked = MultiDocValues.GetNumericValues(reader, "dvPacked");
        NumericDocValues dvShort = MultiDocValues.GetNumericValues(reader, "dvShort");
        SortedSetDocValues dvSortedSet = null;
        if (is42Index)
        {
            dvSortedSet = MultiDocValues.GetSortedSetValues(reader, "dvSortedSet");
        }

        for (int i = 0; i < 35; i++)
        {
            int id = Convert.ToInt32(reader.Document(i).Get("id"));
            Assert.AreEqual(id, dvByte.Get(i));

            // All binary/sorted dv fields were written as the big-endian
            // 4-byte encoding of the id.
            sbyte[] bytes = new sbyte[] { (sbyte)((int)((uint)id >> 24)), (sbyte)((int)((uint)id >> 16)), (sbyte)((int)((uint)id >> 8)), (sbyte)id };
            BytesRef expectedRef = new BytesRef((byte[])(Array)bytes);
            BytesRef scratch = new BytesRef();

            dvBytesDerefFixed.Get(i, scratch);
            Assert.AreEqual(expectedRef, scratch);
            dvBytesDerefVar.Get(i, scratch);
            Assert.AreEqual(expectedRef, scratch);
            dvBytesSortedFixed.Get(i, scratch);
            Assert.AreEqual(expectedRef, scratch);
            dvBytesSortedVar.Get(i, scratch);
            Assert.AreEqual(expectedRef, scratch);
            dvBytesStraightFixed.Get(i, scratch);
            Assert.AreEqual(expectedRef, scratch);
            dvBytesStraightVar.Get(i, scratch);
            Assert.AreEqual(expectedRef, scratch);

            // Floating-point dv values are stored as raw bit patterns.
            Assert.AreEqual((double)id, BitConverter.Int64BitsToDouble(dvDouble.Get(i)), 0D);
            Assert.AreEqual((float)id, Number.Int32BitsToSingle((int)dvFloat.Get(i)), 0F);

            Assert.AreEqual(id, dvInt.Get(i));
            Assert.AreEqual(id, dvLong.Get(i));
            Assert.AreEqual(id, dvPacked.Get(i));
            Assert.AreEqual(id, dvShort.Get(i));

            if (is42Index)
            {
                // Each doc has exactly one ord in the sorted set, and it
                // must look up to the same big-endian id bytes.
                dvSortedSet.SetDocument(i);
                long ord = dvSortedSet.NextOrd();
                Assert.AreEqual(SortedSetDocValues.NO_MORE_ORDS, dvSortedSet.NextOrd());
                dvSortedSet.LookupOrd(ord, scratch);
                Assert.AreEqual(expectedRef, scratch);
            }
        }
    }

    ScoreDoc[] hits = searcher.Search(new TermQuery(new Term("content", "aaa")), null, 1000).ScoreDocs;

    // First document should be #0
    Document doc = searcher.IndexReader.Document(hits[0].Doc);
    assertEquals("didn't get the right document first", "0", doc.Get("id"));

    DoTestHits(hits, 34, searcher.IndexReader);

    if (is40Index)
    {
        hits = searcher.Search(new TermQuery(new Term("content5", "aaa")), null, 1000).ScoreDocs;
        DoTestHits(hits, 34, searcher.IndexReader);

        hits = searcher.Search(new TermQuery(new Term("content6", "aaa")), null, 1000).ScoreDocs;
        DoTestHits(hits, 34, searcher.IndexReader);
    }

    hits = searcher.Search(new TermQuery(new Term("utf8", "\u0000")), null, 1000).ScoreDocs;
    Assert.AreEqual(34, hits.Length);
    hits = searcher.Search(new TermQuery(new Term("utf8", "lu\uD834\uDD1Ece\uD834\uDD60ne")), null, 1000).ScoreDocs;
    Assert.AreEqual(34, hits.Length);
    hits = searcher.Search(new TermQuery(new Term("utf8", "ab\ud917\udc17cd")), null, 1000).ScoreDocs;
    Assert.AreEqual(34, hits.Length);

    reader.Dispose();
}
/// <summary>
/// Verifies that DocumentDictionary skips deleted documents: indexes a batch
/// of generated docs, deletes a random subset by term, then checks that the
/// dictionary's entry iterator returns exactly the surviving valid docs with
/// the expected weights.
/// NOTE(review): .size(), .nextBoolean() and .equals() are Java-style calls —
/// presumably extension methods from the test support library; confirm.
/// </summary>
public void TestWithDeletions()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    KeyValuePair<List<string>, IDictionary<string, Document>> res = GenerateIndexDocuments(AtLeast(1000), false, false);
    IDictionary<string, Document> docs = res.Value;
    List<String> invalidDocTerms = res.Key;
    Random rand = Random;
    List<string> termsToDel = new List<string>();
    foreach (Document doc in docs.Values)
    {
        IIndexableField f2 = doc.GetField(FIELD_NAME);
        // Randomly pick deletable docs, but never the "invalid" ones (those
        // are reserved to test iterator skipping below).
        if (rand.nextBoolean() && f2 != null && !invalidDocTerms.Contains(f2.GetStringValue()))
        {
            termsToDel.Add(doc.Get(FIELD_NAME));
        }
        writer.AddDocument(doc);
    }
    writer.Commit();

    Term[] delTerms = new Term[termsToDel.size()];
    for (int i = 0; i < termsToDel.size(); i++)
    {
        delTerms[i] = new Term(FIELD_NAME, termsToDel[i]);
    }

    foreach (Term delTerm in delTerms)
    {
        writer.DeleteDocuments(delTerm);
    }
    writer.Commit();
    writer.Dispose();

    // Mirror the deletions in our expected map.
    foreach (string termToDel in termsToDel)
    {
        // NOTE(review): a generic IDictionary indexer throws when the key is
        // missing, so this null assertion can never fail — it is a carryover
        // from Java's Map.get returning null; confirm intended semantics.
        var toDel = docs[termToDel];
        assertTrue(toDel != null);
        docs.Remove(termToDel);
    }

    IndexReader ir = DirectoryReader.Open(dir);
    assertEquals(ir.NumDocs, docs.size());
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        var field = f.Utf8ToString();
        Document doc = docs[field];
        // Remove each doc as it is seen so leftovers can be checked after
        // the loop.
        docs.Remove(field);
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
        assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
        assertEquals(inputIterator.Payload, null);
    }

    // The only docs never returned by the iterator must be the invalid ones.
    foreach (string invalidTerm in invalidDocTerms)
    {
        var invalid = docs[invalidTerm];
        docs.Remove(invalidTerm);
        assertNotNull(invalid);
    }
    assertTrue(docs.Count == 0);

    ir.Dispose();
    dir.Dispose();
}
/// <summary>
/// Inspects the field instances for one document and decides whether term
/// vectors (and which vector features) will be written. Rejects every
/// inconsistent combination of term-vector flags with an
/// <see cref="System.ArgumentException"/>. Returns true when at least one
/// field requests term vectors.
/// </summary>
internal override bool Start(IIndexableField[] fields, int count)
{
    // Reset all per-document vector flags before scanning the fields.
    doVectors = false;
    doVectorPositions = false;
    doVectorOffsets = false;
    doVectorPayloads = false;
    hasPayloads = false;

    for (int idx = 0; idx < count; idx++)
    {
        IIndexableField current = fields[idx];
        var ft = current.IndexableFieldType;

        if (!ft.IsIndexed)
        {
            // A non-indexed field may not request any term-vector feature.
            if (ft.StoreTermVectors)
            {
                throw new System.ArgumentException("cannot index term vectors when field is not indexed (field=\"" + current.Name + "\")");
            }
            if (ft.StoreTermVectorOffsets)
            {
                throw new System.ArgumentException("cannot index term vector offsets when field is not indexed (field=\"" + current.Name + "\")");
            }
            if (ft.StoreTermVectorPositions)
            {
                throw new System.ArgumentException("cannot index term vector positions when field is not indexed (field=\"" + current.Name + "\")");
            }
            if (ft.StoreTermVectorPayloads)
            {
                throw new System.ArgumentException("cannot index term vector payloads when field is not indexed (field=\"" + current.Name + "\")");
            }
        }
        else if (ft.StoreTermVectors)
        {
            // Accumulate the requested vector features across all instances
            // of this field in the document.
            doVectors = true;
            doVectorPositions |= ft.StoreTermVectorPositions;
            doVectorOffsets |= ft.StoreTermVectorOffsets;
            if (doVectorPositions)
            {
                doVectorPayloads |= ft.StoreTermVectorPayloads;
            }
            else if (ft.StoreTermVectorPayloads)
            {
                // TODO: move this check somewhere else, and impl the other missing ones
                throw new System.ArgumentException("cannot index term vector payloads without term vector positions (field=\"" + current.Name + "\")");
            }
        }
        else
        {
            // Indexed but not vectored: sub-features must not be requested.
            if (ft.StoreTermVectorOffsets)
            {
                throw new System.ArgumentException("cannot index term vector offsets when term vectors are not indexed (field=\"" + current.Name + "\")");
            }
            if (ft.StoreTermVectorPositions)
            {
                throw new System.ArgumentException("cannot index term vector positions when term vectors are not indexed (field=\"" + current.Name + "\")");
            }
            if (ft.StoreTermVectorPayloads)
            {
                throw new System.ArgumentException("cannot index term vector payloads when term vectors are not indexed (field=\"" + current.Name + "\")");
            }
        }
    }

    if (doVectors)
    {
        termsWriter.hasVectors = true;
        if (termsHashPerField.bytesHash.Count != 0)
        {
            // Only necessary if previous doc hit a
            // non-aborting exception while writing vectors in
            // this field:
            termsHashPerField.Reset();
        }
    }

    // TODO: only if needed for performance
    //perThread.postingsCount = 0;

    return doVectors;
}
/// <summary>
/// Writes one stored field in the SimpleText (human-readable) format: the
/// field number, name, a type marker, and the value rendered as text using
/// the invariant culture (round-trip "R" format for float/double).
/// </summary>
/// <param name="info">Field metadata; only <c>Number</c> is written.</param>
/// <param name="field">The field whose stored value is serialized.</param>
/// <exception cref="ArgumentException">
/// If the numeric type is unsupported, or the field has neither a binary,
/// string, nor numeric value.
/// </exception>
public override void WriteField(FieldInfo info, IIndexableField field)
{
    Write(FIELD);
    Write(info.Number.ToString(CultureInfo.InvariantCulture));
    NewLine();

    Write(NAME);
    Write(field.Name);
    NewLine();

    Write(TYPE);
    // LUCENENET specific - To avoid boxing/unboxing, we don't
    // call GetNumericValue(). Instead, we check the field.NumericType and then
    // call the appropriate conversion method.
    if (field.NumericType != NumericFieldType.NONE)
    {
        switch (field.NumericType)
        {
            case NumericFieldType.BYTE:
            case NumericFieldType.INT16:
            case NumericFieldType.INT32:
                Write(TYPE_INT);
                NewLine();

                Write(VALUE);
                Write(field.GetStringValue(CultureInfo.InvariantCulture));
                NewLine();
                break;

            case NumericFieldType.INT64:
                Write(TYPE_LONG);
                NewLine();

                Write(VALUE);
                Write(field.GetStringValue(CultureInfo.InvariantCulture));
                NewLine();
                break;

            case NumericFieldType.SINGLE:
                Write(TYPE_FLOAT);
                NewLine();

                Write(VALUE);
                // LUCENENET: Need to specify the "R" for round-trip: http://stackoverflow.com/a/611564
                Write(field.GetStringValue("R", CultureInfo.InvariantCulture));
                NewLine();
                break;

            case NumericFieldType.DOUBLE:
                Write(TYPE_DOUBLE);
                NewLine();

                Write(VALUE);
                // LUCENENET: Need to specify the "R" for round-trip: http://stackoverflow.com/a/611564
                Write(field.GetStringValue("R", CultureInfo.InvariantCulture));
                NewLine();
                break;

            default:
                throw new ArgumentException("cannot store numeric type " + field.NumericType);
        }
    }
    else
    {
        BytesRef bytes = field.GetBinaryValue();
        if (bytes != null)
        {
            Write(TYPE_BINARY);
            NewLine();

            Write(VALUE);
            Write(bytes);
            NewLine();
        }
        else
        {
            // Fix: capture GetStringValue() once so the value written is
            // the same one that was null-checked (the original called the
            // virtual getter twice).
            string value = field.GetStringValue();
            if (value == null)
            {
                throw new ArgumentException("field " + field.Name + " is stored but does not have binaryValue, stringValue nor numericValue");
            }
            Write(TYPE_STRING);
            NewLine();
            Write(VALUE);
            Write(value);
            NewLine();
        }
    }
}
/// <summary>
/// Writes one stored field in the legacy Lucene 3.x format: a vint field
/// number, a "bits" byte describing the value kind, then the payload.
/// Unlike the 4.x writer, the numeric value is obtained as a boxed object
/// via <c>GetNumericValue()</c> and dispatched by runtime type checks.
/// </summary>
public override void WriteField(FieldInfo info, IIndexableField field)
{
    FieldsStream.WriteVInt32(info.Number);
    int bits = 0;
    BytesRef bytes;
    string @string;
    // TODO: maybe a field should serialize itself? this way we don't bake
    // into indexer all these specific encodings for different fields? and
    // apps can customize...

    object number = field.GetNumericValue();
    if (number != null)
    {
        // 'x is T?' on an object matches a boxed T, so these tests pick out
        // the underlying numeric type of the boxed value.
        if (number is sbyte? || number is short? || number is int?)
        {
            bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_INT;
        }
        else if (number is long?)
        {
            bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_LONG;
        }
        else if (number is float?)
        {
            bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_FLOAT;
        }
        else if (number is double?)
        {
            bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_DOUBLE;
        }
        else
        {
            throw new System.ArgumentException("cannot store numeric type " + number.GetType());
        }
        @string = null;
        bytes = null;
    }
    else
    {
        bytes = field.GetBinaryValue();
        if (bytes != null)
        {
            bits |= Lucene3xStoredFieldsReader.FIELD_IS_BINARY;
            @string = null;
        }
        else
        {
            @string = field.GetStringValue();
            if (@string == null)
            {
                throw new System.ArgumentException("field " + field.Name + " is stored but does not have binaryValue, stringValue nor numericValue");
            }
        }
    }
    FieldsStream.WriteByte((byte)(sbyte)bits);

    if (bytes != null)
    {
        FieldsStream.WriteVInt32(bytes.Length);
        FieldsStream.WriteBytes(bytes.Bytes, bytes.Offset, bytes.Length);
    }
    else if (@string != null)
    {
        FieldsStream.WriteString(field.GetStringValue());
    }
    else
    {
        // Only reachable when 'number' was non-null above.
        // NOTE(review): '(int)number' unboxes — if 'number' is actually a
        // boxed sbyte or short, this cast would throw InvalidCastException;
        // presumably callers always box BYTE/INT16 values as int — confirm.
        if (number is sbyte? || number is short? || number is int?)
        {
            FieldsStream.WriteInt32((int)number);
        }
        else if (number is long?)
        {
            FieldsStream.WriteInt64((long)number);
        }
        else if (number is float?)
        {
            FieldsStream.WriteInt32(Number.SingleToInt32Bits((float)number));
        }
        else if (number is double?)
        {
            FieldsStream.WriteInt64(BitConverter.DoubleToInt64Bits((double)number));
        }
        else
        {
            // Unreachable: the first dispatch already validated the type.
            Debug.Assert(false);
        }
    }
}
/// <summary>
/// Runs the inversion chain over all instances of one field in a document:
/// tokenizes each indexed value, validates position increments and offsets,
/// and feeds every token to the downstream consumer. Statement order here is
/// load-bearing — abort/dispose handling depends on it.
/// </summary>
public override void ProcessFields(IIndexableField[] fields, int count)
{
    fieldState.Reset();

    bool doInvert = consumer.Start(fields, count);

    for (int i = 0; i < count; i++)
    {
        IIndexableField field = fields[i];
        IIndexableFieldType fieldType = field.IndexableFieldType;

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (fieldType.IsIndexed && doInvert)
        {
            bool analyzed = fieldType.IsTokenized && docState.analyzer != null;

            // if the field omits norms, the boost cannot be indexed.
            if (fieldType.OmitNorms && field.Boost != 1.0f)
            {
                throw new NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");
            }

            // only bother checking offsets if something will consume them.
            // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
            bool checkOffsets = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            int lastStartOffset = 0;

            // For the 2nd and later values of a multi-valued field, insert
            // the analyzer's position increment gap.
            if (i > 0)
            {
                fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
            }

            /*
             * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
             * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
             * but rather a finally that takes note of the problem.
             */
            bool succeededInProcessingField = false;

            TokenStream stream = field.GetTokenStream(docState.analyzer);
            // reset the TokenStream to the first token
            stream.Reset();

            try
            {
                bool hasMoreTokens = stream.IncrementToken();

                fieldState.AttributeSource = stream;

                IOffsetAttribute offsetAttribute = fieldState.AttributeSource.AddAttribute<IOffsetAttribute>();
                IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute<IPositionIncrementAttribute>();

                if (hasMoreTokens)
                {
                    consumer.Start(field);

                    do
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID

                        int posIncr = posIncrAttribute.PositionIncrement;
                        if (posIncr < 0)
                        {
                            throw new ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
                        }
                        if (fieldState.Position == 0 && posIncr == 0)
                        {
                            throw new ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
                        }
                        int position = fieldState.Position + posIncr;
                        if (position > 0)
                        {
                            // NOTE: confusing: this "mirrors" the
                            // position++ we do below
                            position--;
                        }
                        else if (position < 0)
                        {
                            throw new ArgumentException("position overflow for field '" + field.Name + "'");
                        }

                        // position is legal, we can safely place it in fieldState now.
                        // not sure if anything will use fieldState after non-aborting exc...
                        fieldState.Position = position;

                        if (posIncr == 0)
                        {
                            fieldState.NumOverlap++;
                        }

                        if (checkOffsets)
                        {
                            int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
                            int endOffset = fieldState.Offset + offsetAttribute.EndOffset;
                            if (startOffset < 0 || endOffset < startOffset)
                            {
                                throw new ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
                            }
                            if (startOffset < lastStartOffset)
                            {
                                throw new ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
                            }
                            lastStartOffset = startOffset;
                        }

                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.Length++;
                        fieldState.Position++;
                    } while (stream.IncrementToken());
                }
                // trigger streams to perform end-of-stream operations
                stream.End();
                // TODO: maybe add some safety? then again, its already checked
                // when we come back around to the field...
                fieldState.Position += posIncrAttribute.PositionIncrement;
                fieldState.Offset += offsetAttribute.EndOffset;

                if (docState.maxTermPrefix != null)
                {
                    string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
                    if (docState.infoStream.IsEnabled("IW"))
                    {
                        docState.infoStream.Message("IW", "ERROR: " + msg);
                    }
                    docState.maxTermPrefix = null;
                    throw new ArgumentException(msg);
                }

                /* if success was false above there is an exception coming through and we won't get here.*/
                succeededInProcessingField = true;
            }
            finally
            {
                if (!succeededInProcessingField)
                {
                    // Dispose without letting a secondary exception mask the
                    // one already in flight.
                    IOUtils.DisposeWhileHandlingException(stream);
                }
                else
                {
                    stream.Dispose();
                }
                if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
                {
                    docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                }
            }

            fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
            fieldState.Boost *= field.Boost;
        }

        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }

    consumer.Finish();
    endConsumer.Finish();
}
/// <summary>
/// Called once for a field value before its tokens are consumed.
/// NOTE(review): inferred from the call site in ProcessFields, where
/// <c>consumer.Start(field)</c> runs immediately before token iteration —
/// implementors define the actual per-field setup.
/// </summary>
internal abstract void Start(IIndexableField field);
/// <summary>
/// Adds a field of document <paramref name="docID"/> for processing;
/// concrete subclasses define what "adding" entails.
/// </summary>
/// <param name="docID">Internal document id the field belongs to.</param>
/// <param name="field">The field value being added.</param>
/// <param name="fieldInfo">Per-field metadata for <paramref name="field"/>.</param>
public abstract void AddField(int docID, IIndexableField field, FieldInfo fieldInfo);