Esempio n. 1
0
        private void  MergeTerms()
        {
            try
            {
                freqOutput      = directory.CreateOutput(segment + ".frq");
                proxOutput      = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval    = termInfosWriter.skipInterval;
                maxSkipLevels   = termInfosWriter.maxSkipLevels;
                skipListWriter  = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
                queue           = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                {
                    freqOutput.Close();
                }
                if (proxOutput != null)
                {
                    proxOutput.Close();
                }
                if (termInfosWriter != null)
                {
                    termInfosWriter.Close();
                }
                if (queue != null)
                {
                    queue.Close();
                }
            }
        }
 internal FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent)
     : base()
 {
     this.parent = parent;
     termsOut = parent.termsOut;
     docsWriter = new FormatPostingsDocsWriter(state, this);
 }
        public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) : base()
        {
            dir             = state.directory;
            segment         = state.segmentName;
            totalNumDocs    = state.numDocs;
            this.fieldInfos = fieldInfos;
            termsOut        = new TermInfosWriter(dir, segment, fieldInfos, state.termIndexInterval);

            // TODO: this is a nasty abstraction violation (that we
            // peek down to find freqOut/proxOut) -- we need a
            // better abstraction here whereby these child consumers
            // can provide skip data or not
            skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, totalNumDocs, null, null);

            state.flushedFiles.Add(state.SegmentFileName(IndexFileNames.TERMS_EXTENSION));
            state.flushedFiles.Add(state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));

            termsWriter = new FormatPostingsTermsWriter(state, this);
        }
		public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos):base()
		{
			
			dir = state.directory;
			segment = state.segmentName;
			totalNumDocs = state.numDocs;
			this.fieldInfos = fieldInfos;
			termsOut = new TermInfosWriter(dir, segment, fieldInfos, state.termIndexInterval);
			
			// TODO: this is a nasty abstraction violation (that we
			// peek down to find freqOut/proxOut) -- we need a
			// better abstraction here whereby these child consumers
			// can provide skip data or not
			skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, totalNumDocs, null, null);
			
			SupportClass.CollectionsHelper.AddIfNotContains(state.flushedFiles, state.SegmentFileName(IndexFileNames.TERMS_EXTENSION));
			SupportClass.CollectionsHelper.AddIfNotContains(state.flushedFiles, state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
			
			termsWriter = new FormatPostingsTermsWriter(state, this);
		}
Esempio n. 5
0
		private void  WritePostings(Posting[] postings, System.String segment)
		{
			IndexOutput freq = null, prox = null;
			TermInfosWriter tis = null;
			TermVectorsWriter termVectorWriter = null;
			try
			{
				//open files for inverse index storage
				freq = directory.CreateOutput(segment + ".frq");
				prox = directory.CreateOutput(segment + ".prx");
				tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
				TermInfo ti = new TermInfo();
				System.String currentField = null;
				
				for (int i = 0; i < postings.Length; i++)
				{
					Posting posting = postings[i];
					
					// add an entry to the dictionary with pointers to prox and freq files
					ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), - 1);
					tis.Add(posting.term, ti);
					
					// add an entry to the freq file
					int postingFreq = posting.freq;
					if (postingFreq == 1)
					// optimize freq=1
						freq.WriteVInt(1);
					// set low bit of doc num.
					else
					{
						freq.WriteVInt(0); // the document number
						freq.WriteVInt(postingFreq); // frequency in doc
					}
					
					int lastPosition = 0; // write positions
					int[] positions = posting.positions;
					for (int j = 0; j < postingFreq; j++)
					{
						// use delta-encoding
						int position = positions[j];
						prox.WriteVInt(position - lastPosition);
						lastPosition = position;
					}
					// check to see if we switched to a new field
					System.String termField = posting.term.Field();
					if (currentField != termField)
					{
						// changing field - see if there is something to save
						currentField = termField;
						FieldInfo fi = fieldInfos.FieldInfo(currentField);
						if (fi.storeTermVector)
						{
							if (termVectorWriter == null)
							{
								termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
								termVectorWriter.OpenDocument();
							}
							termVectorWriter.OpenField(currentField);
						}
						else if (termVectorWriter != null)
						{
							termVectorWriter.CloseField();
						}
					}
					if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
					{
						termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
					}
				}
				if (termVectorWriter != null)
					termVectorWriter.CloseDocument();
			}
			finally
			{
				// make an effort to close all streams we can but remember and re-throw
				// the first exception encountered in this process
				System.IO.IOException keep = null;
				if (freq != null)
					try
					{
						freq.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
							keep = e;
					}
				if (prox != null)
					try
					{
						prox.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
							keep = e;
					}
				if (tis != null)
					try
					{
						tis.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
							keep = e;
					}
				if (termVectorWriter != null)
					try
					{
						termVectorWriter.Close();
					}
					catch (System.IO.IOException e)
					{
						if (keep == null)
							keep = e;
					}
				if (keep != null)
				{
					throw new System.IO.IOException(keep.StackTrace);
				}
			}
		}
Esempio n. 6
0
 internal TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval, IState state)
 {
     Initialize(directory, segment, fis, interval, false, state);
     other       = new TermInfosWriter(directory, segment, fis, interval, true, state);
     other.other = this;
 }
Esempio n. 7
0
 public /*internal*/ TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval)
 {
     Initialize(directory, segment, fis, interval, false);
     other       = new TermInfosWriter(directory, segment, fis, interval, true);
     other.other = this;
 }
 internal FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) : base()
 {
     this.parent = parent;
     termsOut    = parent.termsOut;
     docsWriter  = new FormatPostingsDocsWriter(state, this);
 }
 /*internal*/
 public TermInfosWriter(Directory directory, System.String segment, FieldInfos fis, int interval)
 {
     Initialize(directory, segment, fis, interval, false);
     other = new TermInfosWriter(directory, segment, fis, interval, true);
     other.other = this;
 }
        // TODO: would be nice to factor out morme of this, eg the
        // FreqProxFieldMergeState, and code to visit all Fields
        // under the same FieldInfo together, up into TermsHash*.
        // Other writers would presumably share alot of this...
        internal override void flush(IDictionary<object, object> threadsAndFields, DocumentsWriter.FlushState state)
        {
            // Gather all FieldData's that have postings, across all
            // ThreadStates
            List<object> allFields = new List<object>();

            IEnumerator<KeyValuePair<object, object>> it = threadsAndFields.GetEnumerator();
            while (it.MoveNext())
            {

                KeyValuePair<object, object> entry = (KeyValuePair<object, object>)it.Current;

                ICollection<object> fields = (ICollection<object>)entry.Value;

                IEnumerator<object> fieldsIt = fields.GetEnumerator();

                while (fieldsIt.MoveNext())
                {
                    FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField)fieldsIt.Current;
                    if (perField.termsHashPerField.numPostings > 0)
                        allFields.Add(perField);
                }
            }

            // Sort by field name
            allFields.Sort();
            int numAllFields = allFields.Count;

            TermInfosWriter termsOut = new TermInfosWriter(state.directory,
                                                                 state.segmentName,
                                                                 fieldInfos,
                                                                 state.docWriter.writer.GetTermIndexInterval());

            IndexOutput freqOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.FREQ_EXTENSION));
            IndexOutput proxOut;

            if (fieldInfos.HasProx())
                proxOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.PROX_EXTENSION));
            else
                proxOut = null;

            DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
                                                                                   termsOut.maxSkipLevels,
                                                                                   state.numDocsInRAM, freqOut, proxOut);

            int start = 0;
            while (start < numAllFields)
            {
                FieldInfo fieldInfo = ((FreqProxTermsWriterPerField)allFields[start]).fieldInfo;
                string fieldName = fieldInfo.name;

                int end = start + 1;
                while (end < numAllFields && ((FreqProxTermsWriterPerField)allFields[end]).fieldInfo.name.Equals(fieldName))
                    end++;

                FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end - start];
                for (int i = start; i < end; i++)
                {
                    fields[i - start] = (FreqProxTermsWriterPerField)allFields[i];

                    // Aggregate the storePayload as seen by the same
                    // field across multiple threads
                    fieldInfo.storePayloads |= fields[i - start].hasPayloads;
                }

                // If this field has postings then add them to the
                // segment
                AppendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);

                for (int i = 0; i < fields.Length; i++)
                {
                    TermsHashPerField perField = fields[i].termsHashPerField;
                    int numPostings = perField.numPostings;
                    perField.reset();
                    perField.shrinkHash(numPostings);
                    fields[i].reset();
                }

                start = end;
            }

            it = threadsAndFields.GetEnumerator();
            while (it.MoveNext())
            {
                KeyValuePair<object, object> entry = (KeyValuePair<object, object>)it.Current;
                FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread)entry.Key;
                perThread.termsHashPerThread.reset(true);
            }

            freqOut.Close();
            if (proxOut != null)
            {
                state.flushedFiles[state.SegmentFileName(IndexFileNames.PROX_EXTENSION)] = state.SegmentFileName(IndexFileNames.PROX_EXTENSION);
                proxOut.Close();
            }
            termsOut.Close();

            // Record all files we have flushed
            state.flushedFiles[state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION)] = state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION);
            state.flushedFiles[state.SegmentFileName(IndexFileNames.FREQ_EXTENSION)] = state.SegmentFileName(IndexFileNames.FREQ_EXTENSION);
            state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_EXTENSION);
            state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION);
        }
        // TODO: would be nice to factor out morme of this, eg the
        // FreqProxFieldMergeState, and code to visit all Fields
        // under the same FieldInfo together, up into TermsHash*.
        // Other writers would presumably share alot of this...

        internal override void flush(IDictionary <object, object> threadsAndFields, DocumentsWriter.FlushState state)
        {
            // Gather all FieldData's that have postings, across all
            // ThreadStates
            List <object> allFields = new List <object>();

            IEnumerator <KeyValuePair <object, object> > it = threadsAndFields.GetEnumerator();

            while (it.MoveNext())
            {
                KeyValuePair <object, object> entry = (KeyValuePair <object, object>)it.Current;

                ICollection <object> fields = (ICollection <object>)entry.Value;

                IEnumerator <object> fieldsIt = fields.GetEnumerator();

                while (fieldsIt.MoveNext())
                {
                    FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField)fieldsIt.Current;
                    if (perField.termsHashPerField.numPostings > 0)
                    {
                        allFields.Add(perField);
                    }
                }
            }

            // Sort by field name
            allFields.Sort();
            int numAllFields = allFields.Count;

            TermInfosWriter termsOut = new TermInfosWriter(state.directory,
                                                           state.segmentName,
                                                           fieldInfos,
                                                           state.docWriter.writer.GetTermIndexInterval());

            IndexOutput freqOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.FREQ_EXTENSION));
            IndexOutput proxOut;

            if (fieldInfos.HasProx())
            {
                proxOut = state.directory.CreateOutput(state.SegmentFileName(IndexFileNames.PROX_EXTENSION));
            }
            else
            {
                proxOut = null;
            }

            DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
                                                                             termsOut.maxSkipLevels,
                                                                             state.numDocsInRAM, freqOut, proxOut);

            int start = 0;

            while (start < numAllFields)
            {
                FieldInfo fieldInfo = ((FreqProxTermsWriterPerField)allFields[start]).fieldInfo;
                string    fieldName = fieldInfo.name;

                int end = start + 1;
                while (end < numAllFields && ((FreqProxTermsWriterPerField)allFields[end]).fieldInfo.name.Equals(fieldName))
                {
                    end++;
                }

                FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end - start];
                for (int i = start; i < end; i++)
                {
                    fields[i - start] = (FreqProxTermsWriterPerField)allFields[i];

                    // Aggregate the storePayload as seen by the same
                    // field across multiple threads
                    fieldInfo.storePayloads |= fields[i - start].hasPayloads;
                }

                // If this field has postings then add them to the
                // segment
                AppendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);

                for (int i = 0; i < fields.Length; i++)
                {
                    TermsHashPerField perField = fields[i].termsHashPerField;
                    int numPostings            = perField.numPostings;
                    perField.reset();
                    perField.shrinkHash(numPostings);
                    fields[i].reset();
                }

                start = end;
            }

            it = threadsAndFields.GetEnumerator();
            while (it.MoveNext())
            {
                KeyValuePair <object, object> entry     = (KeyValuePair <object, object>)it.Current;
                FreqProxTermsWriterPerThread  perThread = (FreqProxTermsWriterPerThread)entry.Key;
                perThread.termsHashPerThread.reset(true);
            }

            freqOut.Close();
            if (proxOut != null)
            {
                state.flushedFiles[state.SegmentFileName(IndexFileNames.PROX_EXTENSION)] = state.SegmentFileName(IndexFileNames.PROX_EXTENSION);
                proxOut.Close();
            }
            termsOut.Close();

            // Record all files we have flushed
            state.flushedFiles[state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION)] = state.SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION);
            state.flushedFiles[state.SegmentFileName(IndexFileNames.FREQ_EXTENSION)]        = state.SegmentFileName(IndexFileNames.FREQ_EXTENSION);
            state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_EXTENSION)]       = state.SegmentFileName(IndexFileNames.TERMS_EXTENSION);
            state.flushedFiles[state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)] = state.SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION);
        }
Esempio n. 12
0
        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateOutput(segment + ".frq");
                proxOutput = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval = termInfosWriter.skipInterval;
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        void AppendPostings(DocumentsWriter.FlushState flushState,
                            FreqProxTermsWriterPerField[] fields,
                            TermInfosWriter termsOut,
                            IndexOutput freqOut,
                            IndexOutput proxOut,
                            DefaultSkipListWriter skipListWriter)
        {
            int fieldNumber = fields[0].fieldInfo.number;
            int numFields = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.nextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            int skipInterval = termsOut.skipInterval;
            bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

            // If current field omits tf then it cannot store
            // payloads.  We silently drop the payloads in this case:
            bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            while (numFields > 0)
            {

                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text = mergeStates[i].text;
                    int textOffset = mergeStates[i].textOffset;
                    int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge = 1;
                    }
                    else if (cmp == 0)
                        termStates[numToMerge++] = mergeStates[i];
                }

                int df = 0;
                int lastPayloadLength = -1;

                int lastDoc = 0;

                char[] text_Renamed = termStates[0].text;
                int start = termStates[0].textOffset;

                long freqPointer = freqOut.GetFilePointer();
                long proxPointer;
                if (proxOut != null)
                    proxPointer = proxOut.GetFilePointer();
                else
                    proxPointer = 0;

                skipListWriter.ResetSkip();

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {

                    if ((++df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                        if (termStates[i].docID < minState.docID)
                            minState = termStates[i];

                    int doc = minState.docID;
                    int termDocFreq = minState.termFreq;

                    System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
                    System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTf)
                    {
                        // omitTf == false so we do write positions & payload
                        System.Diagnostics.Debug.Assert(proxOut != null);
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            if (currentFieldStorePayloads)
                            {
                                int payloadLength;
                                if ((code & 1) != 0)
                                {
                                    // This position has a payload
                                    payloadLength = prox.ReadVInt();
                                }
                                else
                                    payloadLength = 0;
                                if (payloadLength != lastPayloadLength)
                                {
                                    proxOut.WriteVInt(code | 1);
                                    proxOut.WriteVInt(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                    proxOut.WriteVInt(code & (~1));
                                if (payloadLength > 0)
                                    copyBytes(prox, proxOut, payloadLength);
                            }
                            else
                            {
                                System.Diagnostics.Debug.Assert(0 == (code & 1));
                                proxOut.WriteVInt(code >> 1);
                            }
                        } //End for

                        int newDocCode = (doc - lastDoc) << 1;

                        if (1 == termDocFreq)
                        {
                            freqOut.WriteVInt(newDocCode | 1);
                        }
                        else
                        {
                            freqOut.WriteVInt(newDocCode);
                            freqOut.WriteVInt(termDocFreq);
                        }
                    }
                    else
                    {
                        // omitTf==true: we store only the docs, without
                        // term freq, positions, payloads
                        freqOut.WriteVInt(doc - lastDoc);
                    }

                    lastDoc = doc;

                    if (!minState.nextDoc())
                    {

                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                            if (termStates[i] != minState)
                                termStates[upto++] = termStates[i];
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.nextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                                if (mergeStates[i] != minState)
                                    mergeStates[upto++] = mergeStates[i];
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                System.Diagnostics.Debug.Assert(df > 0);

                // Done merging this term

                long skipPointer = skipListWriter.WriteSkip(freqOut);

                // Write term
                termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

                // TODO: we could save O(n) re-scan of the term by
                // computing the shared prefix with the last term
                // while during the UTF8 encoding
                termsOut.Add(fieldNumber,
                             termsUTF8.result,
                             termsUTF8.length,
                             termInfo);
            }
        }
Esempio n. 14
0
        // FIXME: OG: remove hard-coded file names
        public static void  Test()
        {
            System.IO.FileInfo file = new System.IO.FileInfo("words.txt");
            System.Console.Out.WriteLine(" reading word file containing " + file.Length + " bytes");

            System.DateTime start = System.DateTime.Now;

            System.Collections.ArrayList keys = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
            System.IO.FileStream         ws   = new System.IO.FileStream(file.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
            System.IO.StreamReader       wr   = new System.IO.StreamReader(new System.IO.StreamReader(ws, System.Text.Encoding.Default).BaseStream, new System.IO.StreamReader(ws, System.Text.Encoding.Default).CurrentEncoding);

            for (System.String key = wr.ReadLine(); key != null; key = wr.ReadLine())
            {
                keys.Add(new Term("word", key));
            }
            wr.Close();

            System.DateTime end = System.DateTime.Now;

            System.Console.Out.Write(end.Ticks - start.Ticks);
            System.Console.Out.WriteLine(" milliseconds to read " + keys.Count + " words");

            start = System.DateTime.Now;

            System.Random gen = new System.Random((System.Int32) 1251971);
            long          fp  = (gen.Next() & 0xF) + 1;
            long          pp  = (gen.Next() & 0xF) + 1;

            int[]  docFreqs     = new int[keys.Count];
            long[] freqPointers = new long[keys.Count];
            long[] proxPointers = new long[keys.Count];
            for (int i = 0; i < keys.Count; i++)
            {
                docFreqs[i]     = (gen.Next() & 0xF) + 1;
                freqPointers[i] = fp;
                proxPointers[i] = pp;
                fp += (gen.Next() & 0xF) + 1;
                ;
                pp += (gen.Next() & 0xF) + 1;
                ;
            }

            end = System.DateTime.Now;

            System.Console.Out.Write(end.Ticks - start.Ticks);
            System.Console.Out.WriteLine(" milliseconds to generate values");

            start = System.DateTime.Now;

            Directory  store = FSDirectory.GetDirectory("test.store", true);
            FieldInfos fis   = new FieldInfos();

            TermInfosWriter writer = new TermInfosWriter(store, "words", fis);

            fis.Add("word", false);

            for (int i = 0; i < keys.Count; i++)
            {
                writer.Add((Term)keys[i], new TermInfo(docFreqs[i], freqPointers[i], proxPointers[i]));
            }

            writer.Close();

            end = System.DateTime.Now;

            System.Console.Out.Write(end.Ticks - start.Ticks);
            System.Console.Out.WriteLine(" milliseconds to write table");

            System.Console.Out.WriteLine(" table occupies " + store.FileLength("words.tis") + " bytes");

            start = System.DateTime.Now;

            TermInfosReader reader = new TermInfosReader(store, "words", fis);

            end = System.DateTime.Now;

            System.Console.Out.Write(end.Ticks - start.Ticks);
            System.Console.Out.WriteLine(" milliseconds to open table");

            start = System.DateTime.Now;

            SegmentTermEnum enumerator = reader.Terms();

            for (int i = 0; i < keys.Count; i++)
            {
                enumerator.Next();
                Term key = (Term)keys[i];
                if (!key.Equals(enumerator.Term()))
                {
                    throw new System.Exception("wrong term: " + enumerator.Term() + ", expected: " + key + " at " + i);
                }
                TermInfo ti = enumerator.TermInfo();
                if (ti.docFreq != docFreqs[i])
                {
                    throw new System.Exception("wrong value: " + System.Convert.ToString(ti.docFreq, 16) + ", expected: " + System.Convert.ToString(docFreqs[i], 16) + " at " + i);
                }
                if (ti.freqPointer != freqPointers[i])
                {
                    throw new System.Exception("wrong value: " + System.Convert.ToString(ti.freqPointer, 16) + ", expected: " + System.Convert.ToString(freqPointers[i], 16) + " at " + i);
                }
                if (ti.proxPointer != proxPointers[i])
                {
                    throw new System.Exception("wrong value: " + System.Convert.ToString(ti.proxPointer, 16) + ", expected: " + System.Convert.ToString(proxPointers[i], 16) + " at " + i);
                }
            }

            end = System.DateTime.Now;

            System.Console.Out.Write(end.Ticks - start.Ticks);
            System.Console.Out.WriteLine(" milliseconds to iterate over " + keys.Count + " words");

            start = System.DateTime.Now;

            for (int i = 0; i < keys.Count; i++)
            {
                Term     key = (Term)keys[i];
                TermInfo ti  = reader.Get(key);
                if (ti.docFreq != docFreqs[i])
                {
                    throw new System.Exception("wrong value: " + System.Convert.ToString(ti.docFreq, 16) + ", expected: " + System.Convert.ToString(docFreqs[i], 16) + " at " + i);
                }
                if (ti.freqPointer != freqPointers[i])
                {
                    throw new System.Exception("wrong value: " + System.Convert.ToString(ti.freqPointer, 16) + ", expected: " + System.Convert.ToString(freqPointers[i], 16) + " at " + i);
                }
                if (ti.proxPointer != proxPointers[i])
                {
                    throw new System.Exception("wrong value: " + System.Convert.ToString(ti.proxPointer, 16) + ", expected: " + System.Convert.ToString(proxPointers[i], 16) + " at " + i);
                }
            }

            end = System.DateTime.Now;

            System.Console.Out.Write((end.Ticks - start.Ticks) / (float)keys.Count);
            System.Console.Out.WriteLine(" average milliseconds per lookup");

            TermEnum e = reader.Terms(new Term("word", "azz"));

            System.Console.Out.WriteLine("Word after azz is " + e.Term().text);

            reader.Close();

            store.Close();
        }
        /* Walk through all unique text tokens (Posting
         * instances) found in this field and serialize them
         * into a single RAM segment. */
        void AppendPostings(DocumentsWriter.FlushState flushState,
                            FreqProxTermsWriterPerField[] fields,
                            TermInfosWriter termsOut,
                            IndexOutput freqOut,
                            IndexOutput proxOut,
                            DefaultSkipListWriter skipListWriter)
        {
            int fieldNumber = fields[0].fieldInfo.number;
            int numFields   = fields.Length;

            FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

            for (int i = 0; i < numFields; i++)
            {
                FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

                System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields[0].fieldInfo);

                // Should always be true
                bool result = fms.nextTerm();
                System.Diagnostics.Debug.Assert(result);
            }

            int  skipInterval       = termsOut.skipInterval;
            bool currentFieldOmitTf = fields[0].fieldInfo.omitTf;

            // If current field omits tf then it cannot store
            // payloads.  We silently drop the payloads in this case:
            bool currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

            FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

            while (numFields > 0)
            {
                // Get the next term to merge
                termStates[0] = mergeStates[0];
                int numToMerge = 1;

                for (int i = 1; i < numFields; i++)
                {
                    char[] text       = mergeStates[i].text;
                    int    textOffset = mergeStates[i].textOffset;
                    int    cmp        = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

                    if (cmp < 0)
                    {
                        termStates[0] = mergeStates[i];
                        numToMerge    = 1;
                    }
                    else if (cmp == 0)
                    {
                        termStates[numToMerge++] = mergeStates[i];
                    }
                }

                int df = 0;
                int lastPayloadLength = -1;

                int lastDoc = 0;

                char[] text_Renamed = termStates[0].text;
                int    start        = termStates[0].textOffset;

                long freqPointer = freqOut.GetFilePointer();
                long proxPointer;
                if (proxOut != null)
                {
                    proxPointer = proxOut.GetFilePointer();
                }
                else
                {
                    proxPointer = 0;
                }

                skipListWriter.ResetSkip();

                // Now termStates has numToMerge FieldMergeStates
                // which all share the same term.  Now we must
                // interleave the docID streams.
                while (numToMerge > 0)
                {
                    if ((++df % skipInterval) == 0)
                    {
                        skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
                        skipListWriter.BufferSkip(df);
                    }

                    FreqProxFieldMergeState minState = termStates[0];
                    for (int i = 1; i < numToMerge; i++)
                    {
                        if (termStates[i].docID < minState.docID)
                        {
                            minState = termStates[i];
                        }
                    }

                    int doc         = minState.docID;
                    int termDocFreq = minState.termFreq;

                    System.Diagnostics.Debug.Assert(doc < flushState.numDocsInRAM);
                    System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);

                    ByteSliceReader prox = minState.prox;

                    // Carefully copy over the prox + payload info,
                    // changing the format to match Lucene's segment
                    // format.
                    if (!currentFieldOmitTf)
                    {
                        // omitTf == false so we do write positions & payload
                        System.Diagnostics.Debug.Assert(proxOut != null);
                        for (int j = 0; j < termDocFreq; j++)
                        {
                            int code = prox.ReadVInt();
                            if (currentFieldStorePayloads)
                            {
                                int payloadLength;
                                if ((code & 1) != 0)
                                {
                                    // This position has a payload
                                    payloadLength = prox.ReadVInt();
                                }
                                else
                                {
                                    payloadLength = 0;
                                }
                                if (payloadLength != lastPayloadLength)
                                {
                                    proxOut.WriteVInt(code | 1);
                                    proxOut.WriteVInt(payloadLength);
                                    lastPayloadLength = payloadLength;
                                }
                                else
                                {
                                    proxOut.WriteVInt(code & (~1));
                                }
                                if (payloadLength > 0)
                                {
                                    copyBytes(prox, proxOut, payloadLength);
                                }
                            }
                            else
                            {
                                System.Diagnostics.Debug.Assert(0 == (code & 1));
                                proxOut.WriteVInt(code >> 1);
                            }
                        } //End for

                        int newDocCode = (doc - lastDoc) << 1;

                        if (1 == termDocFreq)
                        {
                            freqOut.WriteVInt(newDocCode | 1);
                        }
                        else
                        {
                            freqOut.WriteVInt(newDocCode);
                            freqOut.WriteVInt(termDocFreq);
                        }
                    }
                    else
                    {
                        // omitTf==true: we store only the docs, without
                        // term freq, positions, payloads
                        freqOut.WriteVInt(doc - lastDoc);
                    }

                    lastDoc = doc;

                    if (!minState.nextDoc())
                    {
                        // Remove from termStates
                        int upto = 0;
                        for (int i = 0; i < numToMerge; i++)
                        {
                            if (termStates[i] != minState)
                            {
                                termStates[upto++] = termStates[i];
                            }
                        }
                        numToMerge--;
                        System.Diagnostics.Debug.Assert(upto == numToMerge);

                        // Advance this state to the next term

                        if (!minState.nextTerm())
                        {
                            // OK, no more terms, so remove from mergeStates
                            // as well
                            upto = 0;
                            for (int i = 0; i < numFields; i++)
                            {
                                if (mergeStates[i] != minState)
                                {
                                    mergeStates[upto++] = mergeStates[i];
                                }
                            }
                            numFields--;
                            System.Diagnostics.Debug.Assert(upto == numFields);
                        }
                    }
                }

                System.Diagnostics.Debug.Assert(df > 0);

                // Done merging this term

                long skipPointer = skipListWriter.WriteSkip(freqOut);

                // Write term
                termInfo.Set(df, freqPointer, proxPointer, (int)(skipPointer - freqPointer));

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text_Renamed, start, termsUTF8);

                // TODO: we could save O(n) re-scan of the term by
                // computing the shared prefix with the last term
                // while during the UTF8 encoding
                termsOut.Add(fieldNumber,
                             termsUTF8.result,
                             termsUTF8.length,
                             termInfo);
            }
        }
Esempio n. 16
0
        private void  WritePostings(Posting[] postings, System.String segment)
        {
            IndexOutput       freq = null, prox = null;
            TermInfosWriter   tis              = null;
            TermVectorsWriter termVectorWriter = null;

            try
            {
                //open files for inverse index storage
                freq = directory.CreateOutput(segment + ".frq");
                prox = directory.CreateOutput(segment + ".prx");
                tis  = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                TermInfo      ti           = new TermInfo();
                System.String currentField = null;

                for (int i = 0; i < postings.Length; i++)
                {
                    Posting posting = postings[i];

                    // add an entry to the dictionary with pointers to prox and freq files
                    ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
                    tis.Add(posting.term, ti);

                    // add an entry to the freq file
                    int postingFreq = posting.freq;
                    if (postingFreq == 1)
                    {
                        // optimize freq=1
                        freq.WriteVInt(1);
                    }
                    // set low bit of doc num.
                    else
                    {
                        freq.WriteVInt(0);                         // the document number
                        freq.WriteVInt(postingFreq);               // frequency in doc
                    }

                    int   lastPosition = 0;                   // write positions
                    int[] positions    = posting.positions;
                    for (int j = 0; j < postingFreq; j++)
                    {
                        // use delta-encoding
                        int position = positions[j];
                        prox.WriteVInt(position - lastPosition);
                        lastPosition = position;
                    }
                    // check to see if we switched to a new field
                    System.String termField = posting.term.Field();
                    if (currentField != termField)
                    {
                        // changing field - see if there is something to save
                        currentField = termField;
                        FieldInfo fi = fieldInfos.FieldInfo(currentField);
                        if (fi.storeTermVector)
                        {
                            if (termVectorWriter == null)
                            {
                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                termVectorWriter.OpenDocument();
                            }
                            termVectorWriter.OpenField(currentField);
                        }
                        else if (termVectorWriter != null)
                        {
                            termVectorWriter.CloseField();
                        }
                    }
                    if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                    {
                        termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
                    }
                }
                if (termVectorWriter != null)
                {
                    termVectorWriter.CloseDocument();
                }
            }
            finally
            {
                // make an effort to close all streams we can but remember and re-throw
                // the first exception encountered in this process
                System.IO.IOException keep = null;
                if (freq != null)
                {
                    try
                    {
                        freq.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (prox != null)
                {
                    try
                    {
                        prox.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (tis != null)
                {
                    try
                    {
                        tis.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (termVectorWriter != null)
                {
                    try
                    {
                        termVectorWriter.Close();
                    }
                    catch (System.IO.IOException e)
                    {
                        if (keep == null)
                        {
                            keep = e;
                        }
                    }
                }
                if (keep != null)
                {
                    throw new System.IO.IOException(keep.StackTrace);
                }
            }
        }
Esempio n. 17
0
		/* Walk through all unique text tokens (Posting
		* instances) found in this field and serialize them
		* into a single RAM segment. */
		internal void  AppendPostings(ThreadState.FieldData[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut)
		{
			
			int fieldNumber = fields[0].fieldInfo.number;
			int numFields = fields.Length;
			
			FieldMergeState[] mergeStates = new FieldMergeState[numFields];
			
			for (int i = 0; i < numFields; i++)
			{
				FieldMergeState fms = mergeStates[i] = new FieldMergeState();
				fms.field = fields[i];
				fms.postings = fms.field.SortPostings();
				
				System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields [0].fieldInfo);
				
				// Should always be true
				bool result = fms.NextTerm();
				System.Diagnostics.Debug.Assert(result);
			}
			
			int skipInterval = termsOut.skipInterval;
			currentFieldStorePayloads = fields[0].fieldInfo.storePayloads;
			
			FieldMergeState[] termStates = new FieldMergeState[numFields];
			
			while (numFields > 0)
			{
				
				// Get the next term to merge
				termStates[0] = mergeStates[0];
				int numToMerge = 1;
				
				for (int i = 1; i < numFields; i++)
				{
					char[] text = mergeStates[i].text;
					int textOffset = mergeStates[i].textOffset;
					int cmp = CompareText(text, textOffset, termStates[0].text, termStates[0].textOffset);
					
					if (cmp < 0)
					{
						termStates[0] = mergeStates[i];
						numToMerge = 1;
					}
					else if (cmp == 0)
						termStates[numToMerge++] = mergeStates[i];
				}
				
				int df = 0;
				int lastPayloadLength = - 1;
				
				int lastDoc = 0;
				
				char[] text2 = termStates[0].text;
				int start = termStates[0].textOffset;
				int pos = start;
				while (text2[pos] != 0xffff)
					pos++;
				
				long freqPointer = freqOut.GetFilePointer();
				long proxPointer = proxOut.GetFilePointer();
				
				skipListWriter.ResetSkip();
				
				// Now termStates has numToMerge FieldMergeStates
				// which all share the same term.  Now we must
				// interleave the docID streams.
				while (numToMerge > 0)
				{
					
					if ((++df % skipInterval) == 0)
					{
						skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
						skipListWriter.BufferSkip(df);
					}
					
					FieldMergeState minState = termStates[0];
					for (int i = 1; i < numToMerge; i++)
						if (termStates[i].docID < minState.docID)
							minState = termStates[i];
					
					int doc = minState.docID;
					int termDocFreq = minState.termFreq;
					
					System.Diagnostics.Debug.Assert(doc < numDocsInRAM);
					System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);
					
					int newDocCode = (doc - lastDoc) << 1;
					lastDoc = doc;
					
					ByteSliceReader prox = minState.prox;
					
					// Carefully copy over the prox + payload info,
					// changing the format to match Lucene's segment
					// format.
					for (int j = 0; j < termDocFreq; j++)
					{
						int code = prox.ReadVInt();
						if (currentFieldStorePayloads)
						{
							int payloadLength;
							if ((code & 1) != 0)
							{
								// This position has a payload
								payloadLength = prox.ReadVInt();
							}
							else
								payloadLength = 0;
							if (payloadLength != lastPayloadLength)
							{
								proxOut.WriteVInt(code | 1);
								proxOut.WriteVInt(payloadLength);
								lastPayloadLength = payloadLength;
							}
							else
								proxOut.WriteVInt(code & (~ 1));
							if (payloadLength > 0)
								CopyBytes(prox, proxOut, payloadLength);
						}
						else
						{
							System.Diagnostics.Debug.Assert(0 ==(code & 1));
							proxOut.WriteVInt(code >> 1);
						}
					}
					
					if (1 == termDocFreq)
					{
						freqOut.WriteVInt(newDocCode | 1);
					}
					else
					{
						freqOut.WriteVInt(newDocCode);
						freqOut.WriteVInt(termDocFreq);
					}
					
					if (!minState.NextDoc())
					{
						
						// Remove from termStates
						int upto = 0;
						for (int i = 0; i < numToMerge; i++)
							if (termStates[i] != minState)
								termStates[upto++] = termStates[i];
						numToMerge--;
						System.Diagnostics.Debug.Assert(upto == numToMerge);
						
						// Advance this state to the next term
						
						if (!minState.NextTerm())
						{
							// OK, no more terms, so remove from mergeStates
							// as well
							upto = 0;
							for (int i = 0; i < numFields; i++)
								if (mergeStates[i] != minState)
									mergeStates[upto++] = mergeStates[i];
							numFields--;
							System.Diagnostics.Debug.Assert(upto == numFields);
						}
					}
				}
				
				System.Diagnostics.Debug.Assert(df > 0);
				
				// Done merging this term
				
				long skipPointer = skipListWriter.WriteSkip(freqOut);
				
				// Write term
				termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
				termsOut.Add(fieldNumber, text2, start, pos - start, termInfo);
			}
		}
Esempio n. 18
0
		/// <summary>Creates a segment from all Postings in the Postings
		/// hashes across all ThreadStates & FieldDatas. 
		/// </summary>
		private System.Collections.IList WriteSegment()
		{
			
			System.Diagnostics.Debug.Assert(AllThreadsIdle());
			
			System.Diagnostics.Debug.Assert(nextDocID == numDocsInRAM);
			
			System.String segmentName;
			
			segmentName = segment;
			
			TermInfosWriter termsOut = new TermInfosWriter(directory, segmentName, fieldInfos, writer.GetTermIndexInterval());
			
			IndexOutput freqOut = directory.CreateOutput(segmentName + ".frq");
			IndexOutput proxOut = directory.CreateOutput(segmentName + ".prx");
			
			// Gather all FieldData's that have postings, across all
			// ThreadStates
			System.Collections.ArrayList allFields = new System.Collections.ArrayList();
			System.Diagnostics.Debug.Assert(AllThreadsIdle());
			for (int i = 0; i < threadStates.Length; i++)
			{
				ThreadState state = threadStates[i];
				state.TrimFields();
				int numFields = state.numAllFieldData;
				for (int j = 0; j < numFields; j++)
				{
					ThreadState.FieldData fp = state.allFieldDataArray[j];
					if (fp.numPostings > 0)
						allFields.Add(fp);
				}
			}
			
			// Sort by field name
			allFields.Sort();
			int numAllFields = allFields.Count;
			
			skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, numDocsInRAM, freqOut, proxOut);
			
			int start = 0;
			while (start < numAllFields)
			{
				
				System.String fieldName = ((ThreadState.FieldData) allFields[start]).fieldInfo.name;
				
				int end = start + 1;
				while (end < numAllFields && ((ThreadState.FieldData) allFields[end]).fieldInfo.name.Equals(fieldName))
					end++;
				
				ThreadState.FieldData[] fields = new ThreadState.FieldData[end - start];
				for (int i = start; i < end; i++)
					fields[i - start] = (ThreadState.FieldData) allFields[i];
				
				// If this field has postings then add them to the
				// segment
				AppendPostings(fields, termsOut, freqOut, proxOut);
				
				for (int i = 0; i < fields.Length; i++)
					fields[i].ResetPostingArrays();
				
				start = end;
			}
			
			freqOut.Close();
			proxOut.Close();
			termsOut.Close();
			
			// Record all files we have flushed
			System.Collections.IList flushedFiles = new System.Collections.ArrayList();
			flushedFiles.Add(SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
			flushedFiles.Add(SegmentFileName(IndexFileNames.FREQ_EXTENSION));
			flushedFiles.Add(SegmentFileName(IndexFileNames.PROX_EXTENSION));
			flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_EXTENSION));
			flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
			
			if (hasNorms)
			{
				WriteNorms(segmentName, numDocsInRAM);
				flushedFiles.Add(SegmentFileName(IndexFileNames.NORMS_EXTENSION));
			}
			
			if (infoStream != null)
			{
				long newSegmentSize = SegmentSize(segmentName);
				System.String message = String.Format(nf, "  oldRAMSize={0:d} newFlushedSize={1:d} docs/MB={2:f} new/old={3:%}",
					new Object[] { numBytesUsed, newSegmentSize, (numDocsInRAM / (newSegmentSize / 1024.0 / 1024.0)), (newSegmentSize / numBytesUsed) });
				infoStream.WriteLine(message);
			}
			
			ResetPostingsData();
			
			nextDocID = 0;
			nextWriteDocID = 0;
			numDocsInRAM = 0;
			files = null;
			
			// Maybe downsize postingsFreeList array
			if (postingsFreeList.Length > 1.5 * postingsFreeCount)
			{
				int newSize = postingsFreeList.Length;
				while (newSize > 1.25 * postingsFreeCount)
				{
					newSize = (int) (newSize * 0.8);
				}
				Posting[] newArray = new Posting[newSize];
				Array.Copy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
				postingsFreeList = newArray;
			}
			
			return flushedFiles;
		}
Esempio n. 19
0
        private void MergeTerms()
        {
            try
            {
                freqOutput = directory.CreateOutput(segment + ".frq");
                if (HasProx())
                    proxOutput = directory.CreateOutput(segment + ".prx");
                termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                skipInterval = termInfosWriter.skipInterval;
                maxSkipLevels = termInfosWriter.maxSkipLevels;
                skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
                queue = new SegmentMergeQueue(readers.Count);

                MergeTermInfos();
            }
            finally
            {
                if (freqOutput != null)
                    freqOutput.Close();
                if (proxOutput != null)
                    proxOutput.Close();
                if (termInfosWriter != null)
                    termInfosWriter.Close();
                if (queue != null)
                    queue.Close();
            }
        }
Esempio n. 20
0
		// FIXME: OG: remove hard-coded file names
		public static void  Test()
		{
			
			System.IO.FileInfo file = new System.IO.FileInfo("words.txt");
			System.Console.Out.WriteLine(" reading word file containing " + file.Length + " bytes");
			
			System.DateTime start = System.DateTime.Now;
			
			System.Collections.ArrayList keys = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
			System.IO.FileStream ws = new System.IO.FileStream(file.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
			System.IO.StreamReader wr = new System.IO.StreamReader(new System.IO.StreamReader(ws, System.Text.Encoding.Default).BaseStream, new System.IO.StreamReader(ws, System.Text.Encoding.Default).CurrentEncoding);
			
			for (System.String key = wr.ReadLine(); key != null; key = wr.ReadLine())
				keys.Add(new Term("word", key));
			wr.Close();
			
			System.DateTime end = System.DateTime.Now;
			
			System.Console.Out.Write(end.Ticks - start.Ticks);
			System.Console.Out.WriteLine(" milliseconds to read " + keys.Count + " words");
			
			start = System.DateTime.Now;
			
			System.Random gen = new System.Random((System.Int32) 1251971);
			long fp = (gen.Next() & 0xF) + 1;
			long pp = (gen.Next() & 0xF) + 1;
			int[] docFreqs = new int[keys.Count];
			long[] freqPointers = new long[keys.Count];
			long[] proxPointers = new long[keys.Count];
			for (int i = 0; i < keys.Count; i++)
			{
				docFreqs[i] = (gen.Next() & 0xF) + 1;
				freqPointers[i] = fp;
				proxPointers[i] = pp;
				fp += (gen.Next() & 0xF) + 1;
				;
				pp += (gen.Next() & 0xF) + 1;
				;
			}
			
			end = System.DateTime.Now;
			
			System.Console.Out.Write(end.Ticks - start.Ticks);
			System.Console.Out.WriteLine(" milliseconds to generate values");
			
			start = System.DateTime.Now;
			
			Directory store = FSDirectory.GetDirectory("test.store", true);
			FieldInfos fis = new FieldInfos();
			
			TermInfosWriter writer = new TermInfosWriter(store, "words", fis);
			fis.Add("word", false);
			
			for (int i = 0; i < keys.Count; i++)
				writer.Add((Term) keys[i], new TermInfo(docFreqs[i], freqPointers[i], proxPointers[i]));
			
			writer.Close();
			
			end = System.DateTime.Now;
			
			System.Console.Out.Write(end.Ticks - start.Ticks);
			System.Console.Out.WriteLine(" milliseconds to write table");
			
			System.Console.Out.WriteLine(" table occupies " + store.FileLength("words.tis") + " bytes");
			
			start = System.DateTime.Now;
			
			TermInfosReader reader = new TermInfosReader(store, "words", fis);
			
			end = System.DateTime.Now;
			
			System.Console.Out.Write(end.Ticks - start.Ticks);
			System.Console.Out.WriteLine(" milliseconds to open table");
			
			start = System.DateTime.Now;
			
			SegmentTermEnum enumerator = reader.Terms();
			for (int i = 0; i < keys.Count; i++)
			{
				enumerator.Next();
				Term key = (Term) keys[i];
				if (!key.Equals(enumerator.Term()))
				{
					throw new System.Exception("wrong term: " + enumerator.Term() + ", expected: " + key + " at " + i);
				}
				TermInfo ti = enumerator.TermInfo();
				if (ti.docFreq != docFreqs[i])
					throw new System.Exception("wrong value: " + System.Convert.ToString(ti.docFreq, 16) + ", expected: " + System.Convert.ToString(docFreqs[i], 16) + " at " + i);
				if (ti.freqPointer != freqPointers[i])
					throw new System.Exception("wrong value: " + System.Convert.ToString(ti.freqPointer, 16) + ", expected: " + System.Convert.ToString(freqPointers[i], 16) + " at " + i);
				if (ti.proxPointer != proxPointers[i])
					throw new System.Exception("wrong value: " + System.Convert.ToString(ti.proxPointer, 16) + ", expected: " + System.Convert.ToString(proxPointers[i], 16) + " at " + i);
			}
			
			end = System.DateTime.Now;
			
			System.Console.Out.Write(end.Ticks - start.Ticks);
			System.Console.Out.WriteLine(" milliseconds to iterate over " + keys.Count + " words");
			
			start = System.DateTime.Now;
			
			for (int i = 0; i < keys.Count; i++)
			{
				Term key = (Term) keys[i];
				TermInfo ti = reader.Get(key);
				if (ti.docFreq != docFreqs[i])
					throw new System.Exception("wrong value: " + System.Convert.ToString(ti.docFreq, 16) + ", expected: " + System.Convert.ToString(docFreqs[i], 16) + " at " + i);
				if (ti.freqPointer != freqPointers[i])
					throw new System.Exception("wrong value: " + System.Convert.ToString(ti.freqPointer, 16) + ", expected: " + System.Convert.ToString(freqPointers[i], 16) + " at " + i);
				if (ti.proxPointer != proxPointers[i])
					throw new System.Exception("wrong value: " + System.Convert.ToString(ti.proxPointer, 16) + ", expected: " + System.Convert.ToString(proxPointers[i], 16) + " at " + i);
			}
			
			end = System.DateTime.Now;
			
			System.Console.Out.Write((end.Ticks - start.Ticks) / (float) keys.Count);
			System.Console.Out.WriteLine(" average milliseconds per lookup");
			
			TermEnum e = reader.Terms(new Term("word", "azz"));
			System.Console.Out.WriteLine("Word after azz is " + e.Term().text);
			
			reader.Close();
			
			store.Close();
		}