DataOutput backed by a byte array. WARNING: this class omits most low-level checks, so be sure to test heavily with assertions enabled. @lucene.experimental
Inheritance: DataOutput
        /// <summary>
        /// Builds this suggester's completion automata from the entries supplied by
        /// <paramref name="iterator"/>.
        /// <para>
        /// Every entry is written to a temporary file, prefixed with its encoded weight. The file is
        /// then sorted offline and the sorted entries are assigned to <c>buckets</c> buckets by their
        /// position in the sorted order; an entry whose encoded weight equals that of the preceding
        /// entry reuses the preceding entry's bucket. The weight prefix is stripped before the entry
        /// is handed to the <c>FSTCompletionBuilder</c>.
        /// </para>
        /// <para>
        /// Both temporary files are removed on a best-effort basis when the method exits, whether it
        /// succeeds or fails.
        /// </para>
        /// </summary>
        /// <param name="iterator">Source of the entries and their weights.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="iterator"/> is <c>null</c>.</exception>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="iterator"/> reports payloads or contexts, which this suggester does not support.
        /// </exception>
        public override void Build(IInputIterator iterator)
        {
            if (iterator == null)
            {
                throw new System.ArgumentNullException("iterator");
            }
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            // Everything that needs cleanup is declared up front and created inside the try block,
            // so a failure half-way through setup still reaches the finally block below.
            FileInfo tempInput = null;
            FileInfo tempSorted = null;
            OfflineSorter.ByteSequencesWriter writer = null;
            OfflineSorter.ByteSequencesReader reader = null;
            ExternalRefSorter sorter = null;

            bool success = false;
            try
            {
                tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
                tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());
                writer = new OfflineSorter.ByteSequencesWriter(tempInput);

                // Push floats up front before sequences to sort them. For now, assume they are non-negative.
                // If negative floats are allowed some trickery needs to be done to find their byte order.
                count = 0;

                byte[] buffer = new byte[0];
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                BytesRef spare;
                while ((spare = iterator.Next()) != null)
                {
                    // Room for the 4-byte encoded weight plus the entry bytes.
                    if (spare.Length + 4 >= buffer.Length)
                    {
                        buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                    }

                    output.Reset(buffer);
                    output.WriteInt(EncodeWeight(iterator.Weight));
                    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                    writer.Write(buffer, 0, output.Position);
                }
                writer.Dispose();

                // We don't know the distribution of scores and we need to bucket them, so we'll sort
                // and divide into equal buckets.
                OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);

                // The unsorted input is no longer needed. If this early delete fails, the finally
                // block tries again, so a failure here must not abort the build.
                DeleteTempFileQuietly(tempInput);
                FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                int inputLines = info.Lines;
                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                long line = 0;
                int previousBucket = 0;
                int previousScore = 0;
                ByteArrayDataInput input = new ByteArrayDataInput();
                BytesRef tmp1 = new BytesRef();
                BytesRef tmp2 = new BytesRef();
                while (reader.Read(tmp1))
                {
                    input.Reset(tmp1.Bytes);
                    int currentScore = input.ReadInt();

                    // Entries with the same score as their predecessor stay in the predecessor's
                    // bucket; otherwise the bucket follows the entry's position in the sorted file.
                    int bucket;
                    if (line > 0 && currentScore == previousScore)
                    {
                        bucket = previousBucket;
                    }
                    else
                    {
                        bucket = (int)(line * buckets / inputLines);
                    }
                    previousScore = currentScore;
                    previousBucket = bucket;

                    // Only append the input, discard the weight.
                    tmp2.Bytes = tmp1.Bytes;
                    tmp2.Offset = input.Position;
                    tmp2.Length = tmp1.Length - input.Position;
                    builder.Add(tmp2, bucket);

                    line++;
                    count++;
                }

                // The two FSTCompletions share the same automaton.
                this.higherWeightsCompletion = builder.Build();
                this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                success = true;
            }
            finally
            {
                try
                {
                    if (success)
                    {
                        IOUtils.Close(reader, writer, sorter);
                    }
                    else
                    {
                        IOUtils.CloseWhileHandlingException(reader, writer, sorter);
                    }
                }
                finally
                {
                    // Runs even when closing throws, so the temporary files are not left behind.
                    DeleteTempFileQuietly(tempInput);
                    DeleteTempFileQuietly(tempSorted);
                }
            }
        }

        /// <summary>
        /// Deletes a temporary file on a best-effort basis. A <c>null</c> reference is ignored, and
        /// I/O or permission failures are deliberately swallowed so that cleanup never replaces the
        /// exception (if any) that is already propagating out of the caller.
        /// </summary>
        /// <param name="file">The file to delete; may be <c>null</c>.</param>
        private static void DeleteTempFileQuietly(FileInfo file)
        {
            if (file == null)
            {
                return;
            }

            try
            {
                file.Delete();
            }
            catch (System.IO.IOException)
            {
                // Best effort: e.g. the file is still open or locked.
            }
            catch (System.UnauthorizedAccessException)
            {
                // Best effort: no permission to remove the file.
            }
            catch (System.Security.SecurityException)
            {
                // Best effort: the caller lacks the required permission.
            }
        }
示例#2
0
		/// <summary>
		/// Compiles the entries collected in <c>workingSet</c> into an FST and wraps it in a
		/// <seealso cref="SynonymMap"/>.
		/// <para>
		/// Keys are processed in <c>CharsRef.UTF16SortedAsUTF8Comparator</c> order. The output
		/// stored for each key is a vInt header (<c>count &lt;&lt; 1</c>, with the low bit set when
		/// <c>includeOrig</c> is false) followed by one vInt per ord. When <c>dedup</c> is set,
		/// an ord that repeats within the same entry is written only once.
		/// </para>
		/// </summary>
		/// <returns>A new <seealso cref="SynonymMap"/> built from the FST, <c>words</c> and <c>maxHorizontalContext</c>.</returns>
		public virtual SynonymMap Build()
		{
			ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
			// TODO: are we using the best sharing options?
			var fstBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

			BytesRef scratch = new BytesRef(64);
			ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

			// Only allocated when duplicate ords have to be filtered out.
			HashSet<int?> seenOrds = dedup ? new HashSet<int?>() : null;

			// Parks the header vInt while the ords are shifted to make room for it.
			var header = new sbyte[5];

			CharsRef[] sortedKeys = workingSet.Keys.ToArray();
			Arrays.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

			IntsRef scratchIntsRef = new IntsRef();

			foreach (CharsRef input in sortedKeys)
			{
				MapEntry entry = workingSet[input];
				int numEntries = entry.ords.Count;

				// Worst case size: up to 5 bytes for the header plus up to 5 bytes per ord.
				scratch.Grow(5 + numEntries * 5);
				scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
				Debug.Assert(scratch.Offset == 0);

				// Write the ords first; the header can only be written once the final count is known.
				int written = 0;
				for (int i = 0; i < numEntries; i++)
				{
					// Add returns false when this ord was already seen for the current entry.
					if (seenOrds != null && !seenOrds.Add(entry.ords[i]))
					{
						continue;
					}
					scratchOutput.WriteVInt(entry.ords[i]);
					written++;
				}

				// Start the next entry with an empty set.
				if (seenOrds != null)
				{
					seenOrds.Clear();
				}

				int ordsEnd = scratchOutput.Position;
				int origFlag = entry.includeOrig ? 0 : 1;
				scratchOutput.WriteVInt(written << 1 | origFlag);
				int headerLen = scratchOutput.Position - ordsEnd;

				// Rotate the header (count + includeOrig) from the tail to the front of the byte[]:
				Array.Copy(scratch.Bytes, ordsEnd, header, 0, headerLen);
				Array.Copy(scratch.Bytes, 0, scratch.Bytes, headerLen, ordsEnd);
				Array.Copy(header, 0, scratch.Bytes, 0, headerLen);

				scratch.Length = scratchOutput.Position - scratch.Offset;
				fstBuilder.Add(Util.ToUTF32(input, scratchIntsRef), BytesRef.DeepCopyOf(scratch));
			}

			return new SynonymMap(fstBuilder.Finish(), words, maxHorizontalContext);
		}