public override void Build(IInputIterator iterator) { if (iterator.HasPayloads) { throw new System.ArgumentException("this suggester doesn't support payloads"); } if (iterator.HasContexts) { throw new System.ArgumentException("this suggester doesn't support contexts"); } FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir()); FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir()); OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput); OfflineSorter.ByteSequencesReader reader = null; ExternalRefSorter sorter = null; // Push floats up front before sequences to sort them. For now, assume they are non-negative. // If negative floats are allowed some trickery needs to be done to find their byte order. bool success = false; count = 0; try { byte[] buffer = new byte[0]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef spare; while ((spare = iterator.Next()) != null) { if (spare.Length + 4 >= buffer.Length) { buffer = ArrayUtil.Grow(buffer, spare.Length + 4); } output.Reset(buffer); output.WriteInt(EncodeWeight(iterator.Weight)); output.WriteBytes(spare.Bytes, spare.Offset, spare.Length); writer.Write(buffer, 0, output.Position); } writer.Dispose(); // We don't know the distribution of scores and we need to bucket them, so we'll sort // and divide into equal buckets. OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted); tempInput.Delete(); FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength); int inputLines = info.Lines; reader = new OfflineSorter.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.Read(tmp1)) { input.Reset(tmp1.Bytes); int currentScore = input.ReadInt(); int bucket; if (line > 0 && currentScore == previousScore) { bucket = previousBucket; } else { bucket = (int)(line * buckets / inputLines); } previousScore = currentScore; previousBucket = bucket; // Only append the input, discard the weight. tmp2.Bytes = tmp1.Bytes; tmp2.Offset = input.Position; tmp2.Length = tmp1.Length - input.Position; builder.Add(tmp2, bucket); line++; count++; } // The two FSTCompletions share the same automaton. this.higherWeightsCompletion = builder.Build(); this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst); success = true; } finally { if (success) { IOUtils.Close(reader, writer, sorter); } else { IOUtils.CloseWhileHandlingException(reader, writer, sorter); } tempInput.Delete(); tempSorted.Delete(); } }
/// <summary> /// Builds an <seealso cref="SynonymMap"/> and returns it. /// </summary> public virtual SynonymMap Build() { ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; // TODO: are we using the best sharing options? var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs); BytesRef scratch = new BytesRef(64); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); HashSet<int?> dedupSet; if (dedup) { dedupSet = new HashSet<int?>(); } else { dedupSet = null; } var spare = new sbyte[5]; Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys; CharsRef[] sortedKeys = keys.ToArray(); Arrays.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator); IntsRef scratchIntsRef = new IntsRef(); //System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet[input]; int numEntries = output.ords.Count; // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.Grow(estimatedSize); scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length); Debug.Assert(scratch.Offset == 0); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once int? ent = output.ords[i]; if (dedupSet.Contains(ent)) { continue; } dedupSet.Add(ent); } scratchOutput.WriteVInt(output.ords[i]); count++; } int pos = scratchOutput.Position; scratchOutput.WriteVInt(count << 1 | (output.includeOrig ? 0 : 1)); int pos2 = scratchOutput.Position; int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen); Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos); Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen); if (dedupSet != null) { dedupSet.Clear(); } scratch.Length = scratchOutput.Position - scratch.Offset; //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); builder.Add(Util.ToUTF32(input, scratchIntsRef), BytesRef.DeepCopyOf(scratch)); } FST<BytesRef> fst = builder.Finish(); return new SynonymMap(fst, words, maxHorizontalContext); }