Example #1
        internal void  AddDocument(Document doc)
        {
            indexStream.WriteLong(fieldsStream.GetFilePointer());

            int storedCount = 0;

            System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable field = (Fieldable)fieldIterator.Current;
                if (field.IsStored())
                {
                    storedCount++;
                }
            }
            fieldsStream.WriteVInt(storedCount);

            fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable field = (Fieldable)fieldIterator.Current;
                if (field.IsStored())
                {
                    WriteField(fieldInfos.FieldInfo(field.Name()), field);
                }
            }
        }
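The writer above makes two passes over doc.GetFields(): the first pass counts the stored fields so the count can be written as a VInt header, and the second pass writes each stored field. Below is a minimal sketch (not part of the source above; field names and values are illustrative) that builds a Document and replicates the first pass.

    // Hedged sketch: assemble a Document and count its stored fields,
    // mirroring the writer's first pass. Only "title" uses Field.Store.YES.
    Document doc = new Document();
    doc.Add(new Field("title", "hello world", Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("body", "indexed but not stored", Field.Store.NO, Field.Index.ANALYZED));

    int storedCount = 0;
    foreach (Fieldable field in doc.GetFields())
    {
        if (field.IsStored())
        {
            storedCount++;
        }
    }
    // storedCount == 1: only the stored "title" field would be written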
Example #2
        public virtual void  TestAddDocument()
        {
            Document testDoc = new Document();

            DocHelper.SetupDoc(testDoc);
            Analyzer    analyzer = new WhitespaceAnalyzer();
            IndexWriter writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

            writer.AddDocument(testDoc);
            writer.Flush();
            SegmentInfo info = writer.NewestSegment();

            writer.Close();
            //After adding the document, we should be able to read it back in
            SegmentReader reader = SegmentReader.Get(info);

            Assert.IsTrue(reader != null);
            Document doc = reader.Document(0);

            Assert.IsTrue(doc != null);

            //System.out.println("Document: " + doc);
            Fieldable[] fields = doc.GetFields("textField2");
            Assert.IsTrue(fields != null && fields.Length == 1);
            Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.FIELD_2_TEXT));
            Assert.IsTrue(fields[0].IsTermVectorStored());

            fields = doc.GetFields("textField1");
            Assert.IsTrue(fields != null && fields.Length == 1);
            Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.FIELD_1_TEXT));
            Assert.IsFalse(fields[0].IsTermVectorStored());

            fields = doc.GetFields("keyField");
            Assert.IsTrue(fields != null && fields.Length == 1);
            Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.KEYWORD_TEXT));

            fields = doc.GetFields(DocHelper.NO_NORMS_KEY);
            Assert.IsTrue(fields != null && fields.Length == 1);
            Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.NO_NORMS_TEXT));

            fields = doc.GetFields(DocHelper.TEXT_FIELD_3_KEY);
            Assert.IsTrue(fields != null && fields.Length == 1);
            Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.FIELD_3_TEXT));

            // test that the norms are not present in the segment if
            // omitNorms is true
            for (int i = 0; i < reader.core_ForNUnit.fieldInfos_ForNUnit.Size(); i++)
            {
                FieldInfo fi = reader.core_ForNUnit.fieldInfos_ForNUnit.FieldInfo(i);
                if (fi.isIndexed_ForNUnit)
                {
                    Assert.IsTrue(fi.omitNorms_ForNUnit == !reader.HasNorms(fi.name_ForNUnit));
                }
            }
        }
Example #3
        public virtual void  TestFieldSelector()
        {
            RAMDirectory  ramDirectory1, ramDirectory2;
            IndexSearcher indexSearcher1, indexSearcher2;

            ramDirectory1 = new RAMDirectory();
            ramDirectory2 = new RAMDirectory();
            Query query = new TermQuery(new Term("contents", "doc0"));

            // Now put the documents in a different index
            InitIndex(ramDirectory1, 10, true, null); // documents with a single token "doc0", "doc1", etc...
            InitIndex(ramDirectory2, 10, true, "x");  // documents with two tokens "doc0" and "x", "doc1" and x, etc...

            indexSearcher1 = new IndexSearcher(ramDirectory1, true);
            indexSearcher2 = new IndexSearcher(ramDirectory2, true);

            MultiSearcher searcher = GetMultiSearcherInstance(new Searcher[] { indexSearcher1, indexSearcher2 });

            Assert.IsTrue(searcher != null, "searcher is null and it shouldn't be");
            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.IsTrue(hits != null, "hits is null and it shouldn't be");
            Assert.IsTrue(hits.Length == 2, hits.Length + " does not equal: " + 2);
            Document document = searcher.Doc(hits[0].Doc);

            Assert.IsTrue(document != null, "document is null and it shouldn't be");
            Assert.IsTrue(document.GetFields().Count == 2, "document.getFields() Size: " + document.GetFields().Count + " is not: " + 2);
            //Should be one document from each directory
            //they both have two fields, contents and other
            ISet<string> ftl = Support.Compatibility.SetFactory.CreateHashSet<string>();

            ftl.Add("other");
            SetBasedFieldSelector fs = new SetBasedFieldSelector(ftl, Support.Compatibility.SetFactory.CreateHashSet<string>());

            document = searcher.Doc(hits[0].Doc, fs);
            Assert.IsTrue(document != null, "document is null and it shouldn't be");
            Assert.IsTrue(document.GetFields().Count == 1, "document.getFields() Size: " + document.GetFields().Count + " is not: " + 1);
            System.String value_Renamed = document.Get("contents");
            Assert.IsTrue(value_Renamed == null, "value is not null and it should be");
            value_Renamed = document.Get("other");
            Assert.IsTrue(value_Renamed != null, "value is null and it shouldn't be");
            ftl.Clear();
            ftl.Add("contents");
            fs            = new SetBasedFieldSelector(ftl, Support.Compatibility.SetFactory.CreateHashSet<string>());
            document      = searcher.Doc(hits[1].Doc, fs);
            value_Renamed = document.Get("contents");
            Assert.IsTrue(value_Renamed != null, "value is null and it shouldn't be");
            value_Renamed = document.Get("other");
            Assert.IsTrue(value_Renamed == null, "value is not null and it should be");
        }
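SetBasedFieldSelector takes two sets: field names to load eagerly and field names to load lazily. The test above applies it through a MultiSearcher; the sketch below shows the same idea against a single IndexReader (the reader variable and field names are assumptions for illustration, not part of the test).

    // Hedged sketch: load only the "other" field of document 0;
    // the second (lazy-load) set is left empty.
    ISet<string> toLoad = Support.Compatibility.SetFactory.CreateHashSet<string>();
    toLoad.Add("other");
    FieldSelector selector = new SetBasedFieldSelector(toLoad, Support.Compatibility.SetFactory.CreateHashSet<string>());

    Document partial = reader.Document(0, selector);
    // partial.Get("other") is populated; partial.Get("contents") returns null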
Example #4
 /// <summary>Adds field info for a Document. </summary>
 public void  Add(Document doc)
 {
     System.Collections.IList       fields        = doc.GetFields();
     System.Collections.IEnumerator fieldIterator = fields.GetEnumerator();
     while (fieldIterator.MoveNext())
     {
         Fieldable field = (Fieldable)fieldIterator.Current;
         Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms());
     }
 }
Example #5
		/// <summary>Adds field info for a Document. </summary>
		public void  Add(Document doc)
		{
			System.Collections.IList fields = doc.GetFields();
			System.Collections.IEnumerator fieldIterator = fields.GetEnumerator();
			while (fieldIterator.MoveNext())
			{
				Fieldable field = (Fieldable) fieldIterator.Current;
				Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms());
			}
		}
Example #6
 /// <summary>Adds field info for a Document. </summary>
 public void  Add(Document doc)
 {
     lock (this)
     {
         IList <Fieldable> fields = doc.GetFields();
         foreach (Fieldable field in fields)
         {
             Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf());
         }
     }
 }
Example #7
        internal void  AddDocument(Document doc)
        {
            indexStream.WriteLong(fieldsStream.GetFilePointer());

            int storedCount = 0;

            foreach (Fieldable field in doc.GetFields())
            {
                if (field.IsStored())
                {
                    storedCount++;
                }
            }
            fieldsStream.WriteVInt(storedCount);

            foreach (Fieldable field in doc.GetFields())
            {
                if (field.IsStored())
                {
                    WriteField(fieldInfos.FieldInfo(field.Name()), field);
                }
            }
        }
Example #8
        public virtual void  TestAddDocument()
        {
            Analyzer       analyzer   = new WhitespaceAnalyzer();
            Similarity     similarity = Similarity.GetDefault();
            DocumentWriter writer     = new DocumentWriter(dir, analyzer, similarity, 50);

            Assert.IsTrue(writer != null);
            try
            {
                writer.AddDocument("test", testDoc);
                //After adding the document, we should be able to read it back in
                SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
                Assert.IsTrue(reader != null);
                Document doc = reader.Document(0);
                Assert.IsTrue(doc != null);

                //System.out.println("Document: " + doc);
                Field[] fields = doc.GetFields("textField2");
                Assert.IsTrue(fields != null && fields.Length == 1);
                Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.FIELD_2_TEXT));
                Assert.IsTrue(fields[0].IsTermVectorStored());

                fields = doc.GetFields("textField1");
                Assert.IsTrue(fields != null && fields.Length == 1);
                Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.FIELD_1_TEXT));
                Assert.IsFalse(fields[0].IsTermVectorStored());

                fields = doc.GetFields("keyField");
                Assert.IsTrue(fields != null && fields.Length == 1);
                Assert.IsTrue(fields[0].StringValue().Equals(DocHelper.KEYWORD_TEXT));
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.Fail(e.Message);
            }
        }
Example #9
        public virtual void  TestDocument()
        {
            Assert.IsTrue(reader.NumDocs() == 1);
            Assert.IsTrue(reader.MaxDoc() >= 1);
            Document result = reader.Document(0);

            Assert.IsTrue(result != null);
            //There are 2 unstored fields on the document that are not preserved across writing
            Assert.IsTrue(DocHelper.NumFields(result) == DocHelper.NumFields(testDoc) - DocHelper.unstored.Count);

            System.Collections.IList fields = result.GetFields();
            for (System.Collections.IEnumerator iter = fields.GetEnumerator(); iter.MoveNext();)
            {
                Fieldable field = (Fieldable)iter.Current;
                Assert.IsTrue(field != null);
                Assert.IsTrue(DocHelper.nameValues.Contains(field.Name()));
            }
        }
Example #10
        /// <summary> Test stored fields for a segment.</summary>
        private Status.StoredFieldStatus TestStoredFields(SegmentInfo info, SegmentReader reader, System.Globalization.NumberFormatInfo format)
        {
            Status.StoredFieldStatus status = new Status.StoredFieldStatus();

            try
            {
                if (infoStream != null)
                {
                    infoStream.Write("    test: stored fields.......");
                }

                // Scan stored fields for all documents
                for (int j = 0; j < info.docCount; ++j)
                {
                    if (!reader.IsDeleted(j))
                    {
                        status.docCount++;
                        Document doc = reader.Document(j);
                        status.totFields += doc.GetFields().Count;
                    }
                }

                // Validate docCount
                if (status.docCount != reader.NumDocs())
                {
                    throw new System.SystemException("docCount=" + status.docCount + " but saw " + status.docCount + " undeleted docs");
                }

                Msg(string.Format(format, "OK [{0:d} total field count; avg {1:f} fields per doc]", new object[] { status.totFields, (((float)status.totFields) / status.docCount) }));
            }
            catch (System.Exception e)
            {
                Msg("ERROR [" + System.Convert.ToString(e.Message) + "]");
                status.error = e;
                if (infoStream != null)
                {
                    infoStream.WriteLine(e.StackTrace);
                }
            }

            return(status);
        }
Example #11
        public virtual void  TestDocument()
        {
            Directory      dir1 = GetDir1();
            Directory      dir2 = GetDir2();
            ParallelReader pr   = new ParallelReader();

            pr.Add(IndexReader.Open(dir1));
            pr.Add(IndexReader.Open(dir2));

            Document doc11  = pr.Document(0, new MapFieldSelector(new System.String[] { "f1" }));
            Document doc24  = pr.Document(1, new MapFieldSelector(new System.Collections.ArrayList(new System.String[] { "f4" })));
            Document doc223 = pr.Document(1, new MapFieldSelector(new System.String[] { "f2", "f3" }));

            Assert.AreEqual(1, doc11.GetFields().Count);
            Assert.AreEqual(1, doc24.GetFields().Count);
            Assert.AreEqual(2, doc223.GetFields().Count);

            Assert.AreEqual("v1", doc11.Get("f1"));
            Assert.AreEqual("v2", doc24.Get("f4"));
            Assert.AreEqual("v2", doc223.Get("f2"));
            Assert.AreEqual("v2", doc223.Get("f3"));
        }
Example #12
		public static int NumFields(Document doc)
		{
			return doc.GetFields().Count;
		}
Example #13
        public virtual void  searchIndex(System.String dirName, System.String oldName)
        {
            //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
            //Query query = parser.parse("handle:1");

            dirName = FullDir(dirName);

            Directory     dir      = FSDirectory.Open(new System.IO.FileInfo(dirName));
            IndexSearcher searcher = new IndexSearcher(dir);
            IndexReader   reader   = searcher.GetIndexReader();

            _TestUtil.CheckIndex(dir);

            for (int i = 0; i < 35; i++)
            {
                if (!reader.IsDeleted(i))
                {
                    Document d = reader.Document(i);
                    System.Collections.IList fields = d.GetFields();
                    if (!oldName.StartsWith("19.") && !oldName.StartsWith("20.") && !oldName.StartsWith("21.") && !oldName.StartsWith("22."))
                    {
                        if (d.GetField("content3") == null)
                        {
                            Assert.AreEqual(5, fields.Count);
                            Field f = (Field)d.GetField("id");
                            Assert.AreEqual("" + i, f.StringValue());

                            f = (Field)d.GetField("utf8");
                            Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.StringValue());

                            f = (Field)d.GetField("autf8");
                            Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.StringValue());

                            f = (Field)d.GetField("content2");
                            Assert.AreEqual("here is more content with aaa aaa aaa", f.StringValue());

                            f = (Field)d.GetField("fie\u2C77ld");
                            Assert.AreEqual("field with non-ascii name", f.StringValue());
                        }
                    }
                }
                // Only ID 7 is deleted
                else
                {
                    Assert.AreEqual(7, i);
                }
            }

            ScoreDoc[] hits = searcher.Search(new TermQuery(new Term("content", "aaa")), null, 1000).ScoreDocs;

            // First document should be #21 since its norm was
            // increased:
            Document d2 = searcher.Doc(hits[0].doc);

            Assert.AreEqual("21", d2.Get("id"), "didn't get the right document first");

            TestHits(hits, 34, searcher.GetIndexReader());

            if (!oldName.StartsWith("19.") && !oldName.StartsWith("20.") && !oldName.StartsWith("21.") && !oldName.StartsWith("22."))
            {
                // Test on indices >= 2.3
                hits = searcher.Search(new TermQuery(new Term("utf8", "\u0000")), null, 1000).ScoreDocs;
                Assert.AreEqual(34, hits.Length);
                hits = searcher.Search(new TermQuery(new Term("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne")), null, 1000).ScoreDocs;
                Assert.AreEqual(34, hits.Length);
                hits = searcher.Search(new TermQuery(new Term("utf8", "ab\ud917\udc17cd")), null, 1000).ScoreDocs;
                Assert.AreEqual(34, hits.Length);
            }

            searcher.Close();
            dir.Close();
        }
Example #14
		internal void  AddDocument(Document doc)
		{
			indexStream.WriteLong(fieldsStream.GetFilePointer());
			
			int storedCount = 0;
			System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
			while (fieldIterator.MoveNext())
			{
				Fieldable field = (Fieldable) fieldIterator.Current;
				if (field.IsStored())
					storedCount++;
			}
			fieldsStream.WriteVInt(storedCount);
			
			fieldIterator = doc.GetFields().GetEnumerator();
			while (fieldIterator.MoveNext())
			{
				Fieldable field = (Fieldable) fieldIterator.Current;
				// If the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
				// and field.BinaryValue() already returns the compressed value for a field
				// with IsCompressed()==true, so we disable compression in that case.
				bool disableCompression = (field is FieldsReader.FieldForMerge);
				if (field.IsStored())
				{
					fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));
					
					byte bits = 0;
					if (field.IsTokenized())
						bits |= FieldsWriter.FIELD_IS_TOKENIZED;
					if (field.IsBinary())
						bits |= FieldsWriter.FIELD_IS_BINARY;
					if (field.IsCompressed())
						bits |= FieldsWriter.FIELD_IS_COMPRESSED;
					
					fieldsStream.WriteByte(bits);
					
					if (field.IsCompressed())
					{
						// compression is enabled for the current field
						byte[] data = null;
						
						if (disableCompression)
						{
							// optimized case for merging, the data
							// is already compressed
							data = field.BinaryValue();
						}
						else
						{
							// check if it is a binary field
							if (field.IsBinary())
							{
								data = Compress(field.BinaryValue());
							}
							else
							{
								data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
							}
						}
						int len = data.Length;
						fieldsStream.WriteVInt(len);
						fieldsStream.WriteBytes(data, len);
					}
					else
					{
						// compression is disabled for the current field
						if (field.IsBinary())
						{
							byte[] data = field.BinaryValue();
							int len = data.Length;
							fieldsStream.WriteVInt(len);
							fieldsStream.WriteBytes(data, len);
						}
						else
						{
							fieldsStream.WriteString(field.StringValue());
						}
					}
				}
			}
		}
Example #15
		internal void  AddDocument(Document doc)
		{
			indexStream.WriteLong(fieldsStream.GetFilePointer());
			
			int storedCount = 0;
            foreach(Fieldable field in doc.GetFields())
            {			
				if (field.IsStored())
					storedCount++;
			}
			fieldsStream.WriteVInt(storedCount);
			
			foreach(Fieldable field in doc.GetFields())
            {
				if (field.IsStored())
					WriteField(fieldInfos.FieldInfo(field.Name()), field);
			}
		}
Example #16
        /// <summary>Returns true if index is clean, else false.</summary>
        public static bool Check(Directory dir, bool doFix)
        {
            System.Globalization.NumberFormatInfo nf = System.Globalization.CultureInfo.CurrentCulture.NumberFormat;
            SegmentInfos sis = new SegmentInfos();

            try
            {
                sis.Read(dir);
            }
            catch (System.Exception t)
            {
                out_Renamed.WriteLine("ERROR: could not read any segments file in directory");
                out_Renamed.Write(t.StackTrace);
                out_Renamed.Flush();
                return(false);
            }

            int numSegments = sis.Count;

            System.String segmentsFileName = sis.GetCurrentSegmentFileName();
            IndexInput    input            = null;

            try
            {
                input = dir.OpenInput(segmentsFileName);
            }
            catch (System.Exception t)
            {
                out_Renamed.WriteLine("ERROR: could not open segments file in directory");
                out_Renamed.Write(t.StackTrace);
                out_Renamed.Flush();
                return(false);
            }
            int format = 0;

            try
            {
                format = input.ReadInt();
            }
            catch (System.Exception t)
            {
                out_Renamed.WriteLine("ERROR: could not read segment file version in directory");
                out_Renamed.Write(t.StackTrace);
                out_Renamed.Flush();
                return(false);
            }
            finally
            {
                if (input != null)
                {
                    input.Close();
                }
            }

            System.String sFormat = "";
            bool          skip    = false;

            if (format == SegmentInfos.FORMAT)
            {
                sFormat = "FORMAT [Lucene Pre-2.1]";
            }
            else if (format == SegmentInfos.FORMAT_LOCKLESS)
            {
                sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
            }
            else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
            {
                sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
            }
            else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
            {
                sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
            }
            else if (format < SegmentInfos.FORMAT_SHARED_DOC_STORE)
            {
                sFormat = "int=" + format + " [newer version of Lucene than this tool]";
                skip    = true;
            }
            else
            {
                sFormat = format + " [Lucene 1.3 or prior]";
            }

            out_Renamed.WriteLine("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat);

            if (skip)
            {
                out_Renamed.WriteLine("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
                return(false);
            }

            SegmentInfos newSIS = (SegmentInfos)sis.Clone();

            newSIS.Clear();
            bool changed         = false;
            int  totLoseDocCount = 0;
            int  numBadSegments  = 0;

            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = sis.Info(i);
                out_Renamed.WriteLine("  " + (1 + i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
                int toLoseDocCount = info.docCount;

                SegmentReader reader = null;

                try
                {
                    out_Renamed.WriteLine("    compound=" + info.GetUseCompoundFile());
                    out_Renamed.WriteLine("    numFiles=" + info.Files().Count);
                    out_Renamed.WriteLine(String.Format(nf, "    size (MB)={0:f}", new Object[] { (info.SizeInBytes() / (1024.0 * 1024.0)) }));
                    int docStoreOffset = info.GetDocStoreOffset();
                    if (docStoreOffset != -1)
                    {
                        out_Renamed.WriteLine("    docStoreOffset=" + docStoreOffset);
                        out_Renamed.WriteLine("    docStoreSegment=" + info.GetDocStoreSegment());
                        out_Renamed.WriteLine("    docStoreIsCompoundFile=" + info.GetDocStoreIsCompoundFile());
                    }
                    System.String delFileName = info.GetDelFileName();
                    if (delFileName == null)
                    {
                        out_Renamed.WriteLine("    no deletions");
                    }
                    else
                    {
                        out_Renamed.WriteLine("    has deletions [delFileName=" + delFileName + "]");
                    }
                    out_Renamed.Write("    test: open reader.........");
                    reader = SegmentReader.Get(info);
                    int numDocs = reader.NumDocs();
                    toLoseDocCount = numDocs;
                    if (reader.HasDeletions())
                    {
                        out_Renamed.WriteLine("OK [" + (info.docCount - numDocs) + " deleted docs]");
                    }
                    else
                    {
                        out_Renamed.WriteLine("OK");
                    }

                    out_Renamed.Write("    test: fields, norms.......");
                    System.Collections.IDictionary fieldNames = (System.Collections.IDictionary)reader.GetFieldNames(IndexReader.FieldOption.ALL);
                    System.Collections.IEnumerator it         = fieldNames.Keys.GetEnumerator();
                    while (it.MoveNext())
                    {
                        System.String fieldName = (System.String)it.Current;
                        byte[]        b         = reader.Norms(fieldName);
                        if (b.Length != info.docCount)
                        {
                            throw new System.SystemException("norms for field \"" + fieldName + "\" is length " + b.Length + " != maxDoc " + info.docCount);
                        }
                    }
                    out_Renamed.WriteLine("OK [" + fieldNames.Count + " fields]");

                    out_Renamed.Write("    test: terms, freq, prox...");
                    TermEnum      termEnum      = reader.Terms();
                    TermPositions termPositions = reader.TermPositions();

                    // Used only to count up # deleted docs for this
                    // term
                    MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

                    long termCount = 0;
                    long totFreq   = 0;
                    long totPos    = 0;
                    while (termEnum.Next())
                    {
                        termCount++;
                        Term term    = termEnum.Term();
                        int  docFreq = termEnum.DocFreq();
                        termPositions.Seek(term);
                        int lastDoc = -1;
                        int freq0   = 0;
                        totFreq += docFreq;
                        while (termPositions.Next())
                        {
                            freq0++;
                            int doc  = termPositions.Doc();
                            int freq = termPositions.Freq();
                            if (doc <= lastDoc)
                            {
                                throw new System.SystemException("term " + term + ": doc " + doc + " < lastDoc " + lastDoc);
                            }
                            lastDoc = doc;
                            if (freq <= 0)
                            {
                                throw new System.SystemException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
                            }

                            int lastPos = -1;
                            totPos += freq;
                            for (int j = 0; j < freq; j++)
                            {
                                int pos = termPositions.NextPosition();
                                if (pos < 0)
                                {
                                    throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
                                }
                                if (pos <= lastPos)
                                {
                                    throw new System.SystemException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
                                }
                            }
                        }

                        // Now count how many deleted docs occurred in
                        // this term:
                        int delCount;
                        if (reader.HasDeletions())
                        {
                            myTermDocs.Seek(term);
                            while (myTermDocs.Next())
                            {
                            }
                            delCount = myTermDocs.delCount;
                        }
                        else
                        {
                            delCount = 0;
                        }

                        if (freq0 + delCount != docFreq)
                        {
                            throw new System.SystemException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
                        }
                    }

                    out_Renamed.WriteLine("OK [" + termCount + " terms; " + totFreq + " terms/docs pairs; " + totPos + " tokens]");

                    out_Renamed.Write("    test: stored fields.......");
                    int  docCount  = 0;
                    long totFields = 0;
                    for (int j = 0; j < info.docCount; j++)
                    {
                        if (!reader.IsDeleted(j))
                        {
                            docCount++;
                            Document doc = reader.Document(j);
                            totFields += doc.GetFields().Count;
                        }
                    }

                    if (docCount != reader.NumDocs())
                    {
                        throw new System.SystemException("docCount=" + docCount + " but saw " + docCount + " undeleted docs");
                    }

                    out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total field count; avg {1:f} fields per doc]", new Object[] { totFields, (((float)totFields) / docCount) }));

                    out_Renamed.Write("    test: term vectors........");
                    int totVectors = 0;
                    for (int j = 0; j < info.docCount; j++)
                    {
                        if (!reader.IsDeleted(j))
                        {
                            TermFreqVector[] tfv = reader.GetTermFreqVectors(j);
                            if (tfv != null)
                            {
                                totVectors += tfv.Length;
                            }
                        }
                    }

                    out_Renamed.WriteLine(String.Format(nf, "OK [{0:d} total vector count; avg {1:f} term/freq vector fields per doc]", new Object[] { totVectors, (((float)totVectors) / docCount) }));
                    out_Renamed.WriteLine("");
                }
                catch (System.Exception t)
                {
                    out_Renamed.WriteLine("FAILED");
                    System.String comment;
                    if (doFix)
                    {
                        comment = "will remove reference to this segment (-fix is specified)";
                    }
                    else
                    {
                        comment = "would remove reference to this segment (-fix was not specified)";
                    }
                    out_Renamed.WriteLine("    WARNING: " + comment + "; full exception:");
                    out_Renamed.Write(t.StackTrace);
                    out_Renamed.Flush();
                    out_Renamed.WriteLine("");
                    totLoseDocCount += toLoseDocCount;
                    numBadSegments++;
                    changed = true;
                    continue;
                }
                finally
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }

                // Keeper
                newSIS.Add(info.Clone());
            }

            if (!changed)
            {
                out_Renamed.WriteLine("No problems were detected with this index.\n");
                return(true);
            }
            else
            {
                out_Renamed.WriteLine("WARNING: " + numBadSegments + " broken segments detected");
                if (doFix)
                {
                    out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents will be lost");
                }
                else
                {
                    out_Renamed.WriteLine("WARNING: " + totLoseDocCount + " documents would be lost if -fix were specified");
                }
                out_Renamed.WriteLine();
            }

            if (doFix)
            {
                out_Renamed.WriteLine("NOTE: will write new segments file in 5 seconds; this will remove " + totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
                for (int i = 0; i < 5; i++)
                {
                    try
                    {
                        System.Threading.Thread.Sleep(new System.TimeSpan((System.Int64) 10000 * 1000));
                    }
                    catch (System.Threading.ThreadInterruptedException)
                    {
                        SupportClass.ThreadClass.Current().Interrupt();
                        i--;
                        continue;
                    }

                    out_Renamed.WriteLine("  " + (5 - i) + "...");
                }
                out_Renamed.Write("Writing...");
                try
                {
                    newSIS.Write(dir);
                }
                catch (System.Exception t)
                {
                    out_Renamed.WriteLine("FAILED; exiting");
                    out_Renamed.Write(t.StackTrace);
                    out_Renamed.Flush();
                    return(false);
                }
                out_Renamed.WriteLine("OK");
                out_Renamed.WriteLine("Wrote new segments file \"" + newSIS.GetCurrentSegmentFileName() + "\"");
            }
            else
            {
                out_Renamed.WriteLine("NOTE: would write new segments file [-fix was not specified]");
            }
            out_Renamed.WriteLine("");

            return(false);
        }
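A hedged usage sketch for the checker above, assuming the static Check method lives on the CheckIndex class as in the Lucene.NET port (the index path is illustrative). A read-only pass (doFix == false) only reports problems; doFix == true rewrites the segments file and drops any broken segments.

    // Hedged sketch: run a read-only check first; pass true only if you
    // accept losing the documents in broken segments.
    Directory dir = FSDirectory.Open(new System.IO.FileInfo("/path/to/index"));
    bool clean = CheckIndex.Check(dir, false);
    if (!clean)
    {
        // Destructive repair, commented out on purpose:
        // CheckIndex.Check(dir, true);
    }
    dir.Close();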
Example #17
        public override DocumentsWriter.DocWriter ProcessDocument()
        {
            consumer.StartDocument();
            fieldsWriter.StartDocument();

            Document doc = docState.doc;

            System.Diagnostics.Debug.Assert(docFieldProcessor.docWriter.writer.TestPoint("DocumentsWriter.ThreadState.init start"));

            fieldCount = 0;

            int thisFieldGen = fieldGen++;

            System.Collections.IList docFields = doc.GetFields();
            int numDocFields = docFields.Count;

            // Absorb any new fields first seen in this document.
            // Also absorb any changes to fields we had already
            // seen before (eg suddenly turning on norms or
            // vectors, etc.):

            for (int i = 0; i < numDocFields; i++)
            {
                Fieldable     field     = (Fieldable)docFields[i];
                System.String fieldName = field.Name();

                // Make sure we have a PerField allocated
                int hashPos = fieldName.GetHashCode() & hashMask;
                DocFieldProcessorPerField fp = fieldHash[hashPos];
                while (fp != null && !fp.fieldInfo.name.Equals(fieldName))
                {
                    fp = fp.next;
                }

                if (fp == null)
                {
                    // TODO FI: we need to genericize the "flags" that a
                    // field holds, and, how these flags are merged; it
                    // needs to be more "pluggable" such that if I want
                    // to have a new "thing" my Fields can do, I can
                    // easily add it
                    FieldInfo fi = fieldInfos.Add(fieldName, field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf());

                    fp                 = new DocFieldProcessorPerField(this, fi);
                    fp.next            = fieldHash[hashPos];
                    fieldHash[hashPos] = fp;
                    totalFieldCount++;

                    if (totalFieldCount >= fieldHash.Length / 2)
                    {
                        Rehash();
                    }
                }
                else
                {
                    fp.fieldInfo.Update(field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf());
                }

                if (thisFieldGen != fp.lastGen)
                {
                    // First time we're seeing this field for this doc
                    fp.fieldCount = 0;

                    if (fieldCount == fields.Length)
                    {
                        int newSize = fields.Length * 2;
                        DocFieldProcessorPerField[] newArray = new DocFieldProcessorPerField[newSize];
                        Array.Copy(fields, 0, newArray, 0, fieldCount);
                        fields = newArray;
                    }

                    fields[fieldCount++] = fp;
                    fp.lastGen           = thisFieldGen;
                }

                if (fp.fieldCount == fp.fields.Length)
                {
                    Fieldable[] newArray = new Fieldable[fp.fields.Length * 2];
                    Array.Copy(fp.fields, 0, newArray, 0, fp.fieldCount);
                    fp.fields = newArray;
                }

                fp.fields[fp.fieldCount++] = field;
                if (field.IsStored())
                {
                    fieldsWriter.AddField(field, fp.fieldInfo);
                }
            }

            // If we are writing vectors then we must visit
            // fields in sorted order so they are written in
            // sorted order.  TODO: we actually only need to
            // sort the subset of fields that have vectors
            // enabled; we could save [small amount of] CPU
            // here.
            QuickSort(fields, 0, fieldCount - 1);

            for (int i = 0; i < fieldCount; i++)
            {
                fields[i].consumer.ProcessFields(fields[i].fields, fields[i].fieldCount);
            }

            if (docState.maxTermPrefix != null && docState.infoStream != null)
            {
                docState.infoStream.WriteLine("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
                docState.maxTermPrefix = null;
            }

            DocumentsWriter.DocWriter one = fieldsWriter.FinishDocument();
            DocumentsWriter.DocWriter two = consumer.FinishDocument();
            if (one == null)
            {
                return(two);
            }
            else if (two == null)
            {
                return(one);
            }
            else
            {
                PerDoc both = GetPerDoc();
                both.docID = docState.docID;
                System.Diagnostics.Debug.Assert(one.docID == docState.docID);
                System.Diagnostics.Debug.Assert(two.docID == docState.docID);
                both.one = one;
                both.two = two;
                return(both);
            }
        }
Example #18
 public static int NumFields(Document doc)
 {
     return(doc.GetFields().Count);
 }
Example #19
 public void has_correct_number_of_fields()
 {
     _document.GetFields().Count.ShouldEqual(2);
 }
Example #20
			/// <summary>Initializes shared state for this new document </summary>
			internal void  Init(Document doc, int docID)
			{

                System.Diagnostics.Debug.Assert(!isIdle);
                System.Diagnostics.Debug.Assert(Enclosing_Instance.writer.TestPoint("DocumentsWriter.ThreadState.init start"));
				
				this.docID = docID;
				docBoost = doc.GetBoost();
				numStoredFields = 0;
				numFieldData = 0;
				numVectorFields = 0;
				maxTermPrefix = null;
				
				System.Diagnostics.Debug.Assert(0 == fdtLocal.Length());
				System.Diagnostics.Debug.Assert(0 == fdtLocal.GetFilePointer());
				System.Diagnostics.Debug.Assert(0 == tvfLocal.Length());
				System.Diagnostics.Debug.Assert(0 == tvfLocal.GetFilePointer());
				int thisFieldGen = fieldGen++;
				
				System.Collections.IList docFields = doc.GetFields();
				int numDocFields = docFields.Count;
				bool docHasVectors = false;
				
				// Absorb any new fields first seen in this document.
				// Also absorb any changes to fields we had already
				// seen before (eg suddenly turning on norms or
				// vectors, etc.):
				
				for (int i = 0; i < numDocFields; i++)
				{
					Fieldable field = (Fieldable) docFields[i];
					
					FieldInfo fi = Enclosing_Instance.fieldInfos.Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false);
					if (fi.isIndexed && !fi.omitNorms)
					{
						// Maybe grow our buffered norms
						if (Enclosing_Instance.norms.Length <= fi.number)
						{
							int newSize = (int) ((1 + fi.number) * 1.25);
							BufferedNorms[] newNorms = new BufferedNorms[newSize];
							Array.Copy(Enclosing_Instance.norms, 0, newNorms, 0, Enclosing_Instance.norms.Length);
							Enclosing_Instance.norms = newNorms;
						}
						
						if (Enclosing_Instance.norms[fi.number] == null)
							Enclosing_Instance.norms[fi.number] = new BufferedNorms();
						
						Enclosing_Instance.hasNorms = true;
					}
					
					// Make sure we have a FieldData allocated
					int hashPos = fi.name.GetHashCode() & fieldDataHashMask;
					FieldData fp = fieldDataHash[hashPos];
					while (fp != null && !fp.fieldInfo.name.Equals(fi.name))
						fp = fp.next;
					
					if (fp == null)
					{
						
						fp = new FieldData(this, fi);
						fp.next = fieldDataHash[hashPos];
						fieldDataHash[hashPos] = fp;
						
						if (numAllFieldData == allFieldDataArray.Length)
						{
							int newSize = (int) (allFieldDataArray.Length * 1.5);
							int newHashSize = fieldDataHash.Length * 2;
							
							FieldData[] newArray = new FieldData[newSize];
							FieldData[] newHashArray = new FieldData[newHashSize];
							Array.Copy(allFieldDataArray, 0, newArray, 0, numAllFieldData);
							
							// Rehash
							fieldDataHashMask = newHashSize - 1; // mask must match the new hash array size (a power of two)
							for (int j = 0; j < fieldDataHash.Length; j++)
							{
								FieldData fp0 = fieldDataHash[j];
								while (fp0 != null)
								{
									hashPos = fp0.fieldInfo.name.GetHashCode() & fieldDataHashMask;
									FieldData nextFP0 = fp0.next;
									fp0.next = newHashArray[hashPos];
									newHashArray[hashPos] = fp0;
									fp0 = nextFP0;
								}
							}
							
							allFieldDataArray = newArray;
							fieldDataHash = newHashArray;
						}
						allFieldDataArray[numAllFieldData++] = fp;
					}
					else
					{
						System.Diagnostics.Debug.Assert(fp.fieldInfo == fi);
					}
					
					if (thisFieldGen != fp.lastGen)
					{
						
						// First time we're seeing this field for this doc
						fp.lastGen = thisFieldGen;
						fp.fieldCount = 0;
						fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false;
						fp.doNorms = fi.isIndexed && !fi.omitNorms;
						
						if (numFieldData == fieldDataArray.Length)
						{
							int newSize = fieldDataArray.Length * 2;
							FieldData[] newArray = new FieldData[newSize];
							Array.Copy(fieldDataArray, 0, newArray, 0, numFieldData);
							fieldDataArray = newArray;
						}
						fieldDataArray[numFieldData++] = fp;
					}
					
					if (field.IsTermVectorStored())
					{
						if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.Length)
						{
							int newSize = (int) (numVectorFields * 1.5);
							vectorFieldPointers = new long[newSize];
							vectorFieldNumbers = new int[newSize];
						}
						fp.doVectors = true;
						docHasVectors = true;
						
						fp.doVectorPositions |= field.IsStorePositionWithTermVector();
						fp.doVectorOffsets |= field.IsStoreOffsetWithTermVector();
					}
					
					if (fp.fieldCount == fp.docFields.Length)
					{
						Fieldable[] newArray = new Fieldable[fp.docFields.Length * 2];
						Array.Copy(fp.docFields, 0, newArray, 0, fp.docFields.Length);
						fp.docFields = newArray;
					}
					
					// Lazily allocate arrays for postings:
					if (field.IsIndexed() && fp.postingsHash == null)
						fp.InitPostingArrays();
					
					fp.docFields[fp.fieldCount++] = field;
				}
				
				// Maybe init the local & global fieldsWriter
				if (localFieldsWriter == null)
				{
					if (Enclosing_Instance.fieldsWriter == null)
					{
						System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment == null);
						System.Diagnostics.Debug.Assert(Enclosing_Instance.segment != null);
						Enclosing_Instance.docStoreSegment = Enclosing_Instance.segment;
						// If we hit an exception while init'ing the
						// fieldsWriter, we must abort this segment
						// because those files will be in an unknown
						// state:
						try
						{
							Enclosing_Instance.fieldsWriter = new FieldsWriter(Enclosing_Instance.directory, Enclosing_Instance.docStoreSegment, Enclosing_Instance.fieldInfos);
						}
						catch (System.Exception t)
						{
							throw new AbortException(t, Enclosing_Instance);
						}
						Enclosing_Instance.files = null;
					}
					localFieldsWriter = new FieldsWriter(null, fdtLocal, Enclosing_Instance.fieldInfos);
				}
				
				// First time we see a doc that has field(s) with
				// stored vectors, we init our tvx writer
				if (docHasVectors)
				{
					if (Enclosing_Instance.tvx == null)
					{
						System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment != null);
						// If we hit an exception while init'ing the term
						// vector output files, we must abort this segment
						// because those files will be in an unknown
						// state:
						try
						{
							Enclosing_Instance.tvx = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
							Enclosing_Instance.tvx.WriteInt(TermVectorsReader.FORMAT_VERSION);
							Enclosing_Instance.tvd = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
							Enclosing_Instance.tvd.WriteInt(TermVectorsReader.FORMAT_VERSION);
							Enclosing_Instance.tvf = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
							Enclosing_Instance.tvf.WriteInt(TermVectorsReader.FORMAT_VERSION);
							
							// We must "catch up" for all docs before us
							// that had no vectors:
							for (int i = 0; i < Enclosing_Instance.numDocsInStore; i++)
							{
								Enclosing_Instance.tvx.WriteLong(Enclosing_Instance.tvd.GetFilePointer());
								Enclosing_Instance.tvd.WriteVInt(0);
							}
						}
						catch (System.Exception t)
						{
							throw new AbortException(t, Enclosing_Instance);
						}
						Enclosing_Instance.files = null;
					}
					
					numVectorFields = 0;
				}
			}
Example #21
		public virtual void  TestBinaryFields()
		{
			Directory dir = new RAMDirectory();
			byte[] bin = new byte[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
			
			IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
			
			for (int i = 0; i < 10; i++)
			{
				AddDoc(writer, "document number " + (i + 1));
				AddDocumentWithFields(writer);
				AddDocumentWithDifferentFields(writer);
				AddDocumentWithTermVectorFields(writer);
			}
			writer.Close();
			writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
			Document doc = new Document();
			doc.Add(new Field("bin1", bin, Field.Store.YES));
			doc.Add(new Field("bin2", bin, Field.Store.COMPRESS));
			doc.Add(new Field("junk", "junk text", Field.Store.NO, Field.Index.ANALYZED));
			writer.AddDocument(doc);
			writer.Close();
			IndexReader reader = IndexReader.Open(dir);
			doc = reader.Document(reader.MaxDoc() - 1);
			Field[] fields = doc.GetFields("bin1");
			Assert.IsNotNull(fields);
			Assert.AreEqual(1, fields.Length);
			Field b1 = fields[0];
			Assert.IsTrue(b1.IsBinary());
			byte[] data1 = b1.GetBinaryValue();
			Assert.AreEqual(bin.Length, b1.GetBinaryLength());
			for (int i = 0; i < bin.Length; i++)
			{
				Assert.AreEqual(bin[i], data1[i + b1.GetBinaryOffset()]);
			}
			fields = doc.GetFields("bin2");
			Assert.IsNotNull(fields);
			Assert.AreEqual(1, fields.Length);
			b1 = fields[0];
			Assert.IsTrue(b1.IsBinary());
			data1 = b1.GetBinaryValue();
			Assert.AreEqual(bin.Length, b1.GetBinaryLength());
			for (int i = 0; i < bin.Length; i++)
			{
				Assert.AreEqual(bin[i], data1[i + b1.GetBinaryOffset()]);
			}
			System.Collections.Hashtable lazyFields = new System.Collections.Hashtable();
			SupportClass.CollectionsHelper.AddIfNotContains(lazyFields, "bin1");
			FieldSelector sel = new SetBasedFieldSelector(new System.Collections.Hashtable(), lazyFields);
			doc = reader.Document(reader.MaxDoc() - 1, sel);
			Fieldable[] fieldables = doc.GetFieldables("bin1");
			Assert.IsNotNull(fieldables);
			Assert.AreEqual(1, fieldables.Length);
			Fieldable fb1 = fieldables[0];
			Assert.IsTrue(fb1.IsBinary());
			Assert.AreEqual(bin.Length, fb1.GetBinaryLength());
			data1 = fb1.GetBinaryValue();
			Assert.AreEqual(bin.Length, fb1.GetBinaryLength());
			for (int i = 0; i < bin.Length; i++)
			{
				Assert.AreEqual(bin[i], data1[i + fb1.GetBinaryOffset()]);
			}
			reader.Close();
			// force optimize
			
			
			writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
			writer.Optimize();
			writer.Close();
			reader = IndexReader.Open(dir);
			doc = reader.Document(reader.MaxDoc() - 1);
			fields = doc.GetFields("bin1");
			Assert.IsNotNull(fields);
			Assert.AreEqual(1, fields.Length);
			b1 = fields[0];
			Assert.IsTrue(b1.IsBinary());
			data1 = b1.GetBinaryValue();
			Assert.AreEqual(bin.Length, b1.GetBinaryLength());
			for (int i = 0; i < bin.Length; i++)
			{
				Assert.AreEqual(bin[i], data1[i + b1.GetBinaryOffset()]);
			}
			fields = doc.GetFields("bin2");
			Assert.IsNotNull(fields);
			Assert.AreEqual(1, fields.Length);
			b1 = fields[0];
			Assert.IsTrue(b1.IsBinary());
			data1 = b1.GetBinaryValue();
			Assert.AreEqual(bin.Length, b1.GetBinaryLength());
			for (int i = 0; i < bin.Length; i++)
			{
				Assert.AreEqual(bin[i], data1[i + b1.GetBinaryOffset()]);
			}
			reader.Close();
		}
Example #22
		public virtual void  TestStoredFieldsOrder()
		{
			Directory d = new MockRAMDirectory();
			IndexWriter w = new IndexWriter(d, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
			Document doc = new Document();
			doc.Add(new Field("zzz", "a b c", Field.Store.YES, Field.Index.NO));
			doc.Add(new Field("aaa", "a b c", Field.Store.YES, Field.Index.NO));
			doc.Add(new Field("zzz", "1 2 3", Field.Store.YES, Field.Index.NO));
			w.AddDocument(doc);
			IndexReader r = w.GetReader();
			doc = r.Document(0);
			System.Collections.IEnumerator it = doc.GetFields().GetEnumerator();
			Assert.IsTrue(it.MoveNext());
			Field f = (Field) it.Current;
			Assert.AreEqual(f.Name(), "zzz");
			Assert.AreEqual(f.StringValue(), "a b c");
			
			Assert.IsTrue(it.MoveNext());
			f = (Field) it.Current;
			Assert.AreEqual(f.Name(), "aaa");
			Assert.AreEqual(f.StringValue(), "a b c");
			
			Assert.IsTrue(it.MoveNext());
			f = (Field) it.Current;
			Assert.AreEqual(f.Name(), "zzz");
			Assert.AreEqual(f.StringValue(), "1 2 3");
			Assert.IsFalse(it.MoveNext());
			r.Close();
			w.Close();
			d.Close();
		}
Example #23
        internal void  AddDocument(Document doc)
        {
            indexStream.WriteLong(fieldsStream.GetFilePointer());

            int storedCount = 0;

            System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable field = (Fieldable)fieldIterator.Current;
                if (field.IsStored())
                {
                    storedCount++;
                }
            }
            fieldsStream.WriteVInt(storedCount);

            fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable field = (Fieldable)fieldIterator.Current;
                // If the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
                // and field.BinaryValue() already returns the compressed value for a field
                // with IsCompressed()==true, so we disable compression in that case.
                bool disableCompression = (field is FieldsReader.FieldForMerge);
                if (field.IsStored())
                {
                    fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));

                    byte bits = 0;
                    if (field.IsTokenized())
                    {
                        bits |= FieldsWriter.FIELD_IS_TOKENIZED;
                    }
                    if (field.IsBinary())
                    {
                        bits |= FieldsWriter.FIELD_IS_BINARY;
                    }
                    if (field.IsCompressed())
                    {
                        bits |= FieldsWriter.FIELD_IS_COMPRESSED;
                    }

                    fieldsStream.WriteByte(bits);

                    if (field.IsCompressed())
                    {
                        // compression is enabled for the current field
                        byte[] data = null;

                        if (disableCompression)
                        {
                            // optimized case for merging, the data
                            // is already compressed
                            data = field.BinaryValue();
                        }
                        else
                        {
                            // check if it is a binary field
                            if (field.IsBinary())
                            {
                                data = Compress(field.BinaryValue());
                            }
                            else
                            {
                                data = Compress(System.Text.Encoding.GetEncoding("UTF-8").GetBytes(field.StringValue()));
                            }
                        }
                        int len = data.Length;
                        fieldsStream.WriteVInt(len);
                        fieldsStream.WriteBytes(data, len);
                    }
                    else
                    {
                        // compression is disabled for the current field
                        if (field.IsBinary())
                        {
                            byte[] data = field.BinaryValue();
                            int    len  = data.Length;
                            fieldsStream.WriteVInt(len);
                            fieldsStream.WriteBytes(data, len);
                        }
                        else
                        {
                            fieldsStream.WriteString(field.StringValue());
                        }
                    }
                }
            }
        }
Example #24
		internal void  AddDocument(Document doc)
		{
			indexStream.WriteLong(fieldsStream.GetFilePointer());
			
			int storedCount = 0;
			System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
			while (fieldIterator.MoveNext())
			{
				Fieldable field = (Fieldable) fieldIterator.Current;
				if (field.IsStored())
					storedCount++;
			}
			fieldsStream.WriteVInt(storedCount);
			
			fieldIterator = doc.GetFields().GetEnumerator();
			while (fieldIterator.MoveNext())
			{
				Fieldable field = (Fieldable) fieldIterator.Current;
				if (field.IsStored())
					WriteField(fieldInfos.FieldInfo(field.Name()), field);
			}
		}
Example #25
0
        // Tokenizes the fields of a document into Postings.
        private void  InvertDocument(Document doc)
        {
            System.Collections.IEnumerator fieldIterator = doc.GetFields().GetEnumerator();
            while (fieldIterator.MoveNext())
            {
                Fieldable     field       = (Fieldable)fieldIterator.Current;
                System.String fieldName   = field.Name();
                int           fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length   = fieldLengths[fieldNumber];               // length of field
                int position = fieldPositions[fieldNumber];             // position in field
                if (length > 0)
                {
                    position += analyzer.GetPositionIncrementGap(fieldName);
                }
                int offset = fieldOffsets[fieldNumber];                 // character offset in field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue();
                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                        }
                        else
                        {
                            AddPosition(fieldName, stringValue, position++, null);
                        }
                        offset += stringValue.Length;
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader;                         // find or make Reader
                        if (field.ReaderValue() != null)
                        {
                            reader = field.ReaderValue();
                        }
                        else if (field.StringValue() != null)
                        {
                            reader = new System.IO.StringReader(field.StringValue());
                        }
                        else
                        {
                            throw new System.ArgumentException("field must have either String or Reader value");
                        }

                        // Tokenize field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            Token lastToken = null;
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
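                                // Apply the token's position increment up front; the -1
                                // compensates for the position++ when the posting is added.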
                                position += (t.GetPositionIncrement() - 1);

                                if (field.IsStoreOffsetWithTermVector())
                                {
                                    AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                }
                                else
                                {
                                    AddPosition(fieldName, t.TermText(), position++, null);
                                }

                                lastToken = t;
                                if (++length >= maxFieldLength)
                                {
                                    if (infoStream != null)
                                    {
                                        infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                    }
                                    break;
                                }
                            }

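                            // Advance the running offset past the last token; the +1 leaves
                            // a one-character gap before the next value of this field.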
                            if (lastToken != null)
                            {
                                offset += lastToken.EndOffset() + 1;
                            }
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber]   = length;                   // save field length
                    fieldPositions[fieldNumber] = position;                 // save field position
                    fieldBoosts[fieldNumber]   *= field.GetBoost();
                    fieldOffsets[fieldNumber]   = offset;
                }
            }
        }
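
The position bookkeeping in InvertDocument is easiest to see with a multi-valued field: after the first value, the saved position is advanced by the analyzer's position-increment gap so phrase queries cannot match across value boundaries. Below is a minimal sketch of that arithmetic, assuming whitespace tokenization; PositionSketch and RecordPosition are hypothetical stand-ins for the real analyzer and AddPosition.

        // Minimal sketch of InvertDocument's position arithmetic for a
        // multi-valued field, with whitespace splitting standing in for
        // the analyzer. RecordPosition is a hypothetical stand-in for
        // AddPosition that only logs the term and its position.
        internal static class PositionSketch
        {
            internal static void Invert(string[] fieldValues, int positionIncrementGap)
            {
                int position = 0;
                int length = 0; // tokens seen so far across all values of this field

                foreach (string value in fieldValues)
                {
                    if (length > 0)
                    {
                        // Same rule as InvertDocument: insert a gap between
                        // consecutive values of the field.
                        position += positionIncrementGap;
                    }
                    foreach (string term in value.Split(' '))
                    {
                        RecordPosition(term, position++);
                        length++;
                    }
                }
            }

            private static void RecordPosition(string term, int position)
            {
                System.Console.WriteLine(term + " @ " + position);
            }
        }

With a gap of 100, Invert(new[] { "red apple", "green pear" }, 100) records "red" at 0 and "apple" at 1, then places "green" at 102 rather than 2, so a phrase query for "apple green" cannot match across the two values.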