/// Convenience constant for megabytes. Convenience constant for gigabytes. Minimum recommended buffer size for sorting. Absolute minimum required buffer size for sorting. Maximum number of temporary files before doing an intermediate merge. A bit more descriptive unit for constructors. Creates a BufferSize in MB; the given value must be > 0 and < 2048. Approximately half of the currently available free heap, but no less than ABSOLUTE_MIN_SORT_BUFFER_SIZE; however, if the current heap allocation is insufficient, or if there is a large portion of unallocated heap space available for sorting, consult the maximum allowed heap size. Sort info (mostly for debugging): number of temporary files created when merging partitions; number of partition merges; number of lines of data read; time spent merging sorted partitions (in milliseconds); time spent sorting data (in milliseconds); total time spent (in milliseconds); time spent in I/O read (in milliseconds); read buffer size (in bytes). Creates a new SortInfo (with empty statistics) for debugging. Default comparator: sorts in binary (codepoint) order. Default constructor. Default constructor with a custom comparator. All-details constructor. Sorts input to output, with an explicit hint for the buffer size; the amount of allocated memory may deviate from the hint (may be smaller or larger). Returns the default temporary directory, by default the system's temp folder; if it is not accessible or not available, an IOException is thrown. Copies one file to another. Sorts a single partition in memory. Merges a list of sorted temporary files (partitions) into an output file. Reads in a single partition of data. Utility class to emit length-prefixed byte[] entries to an output stream for sorting; complementary to ByteSequencesReader.
Inheritance: IDisposable
Esempio n. 1
0
        /// <summary>
        /// Writes every entry of <paramref name="data"/>, in order, to a file called
        /// <paramref name="name"/> under <c>tempDir</c> using an
        /// <see cref="OfflineSorter.ByteSequencesWriter"/>.
        /// </summary>
        /// <param name="name">File name; combined with the full path of <c>tempDir</c>.</param>
        /// <param name="data">Byte sequences to write, one <c>Write</c> call per element.</param>
        /// <returns>The <see cref="FileInfo"/> describing the file that was written.</returns>
        private FileInfo WriteAll(string name, byte[][] data)
        {
            FileInfo file = new FileInfo(Path.Combine(tempDir.FullName, name));

            // Create (or truncate) the file up front and release the handle immediately.
            using (file.Create()) { }

            // The using block guarantees the writer is disposed even if a Write call throws,
            // so a failing write can no longer leave the file handle open.
            using (OfflineSorter.ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(file))
            {
                foreach (byte[] datum in data)
                {
                    w.Write(datum);
                }
            }

            return file;
        }
Esempio n. 2
0
 /// <summary>
 /// Disposes the current writer, if there is one, and clears the field afterwards.
 /// Does nothing when no writer is set.
 /// </summary>
 private void CloseWriter()
 {
     if (writer == null)
     {
         return;
     }

     writer.Dispose();
     writer = null;
 }
        /// <summary>
        /// Builds the completion automata from the entries supplied by <paramref name="iterator"/>.
        /// Each entry is written to a temporary file with its encoded weight as an int prefix,
        /// the file is sorted with <see cref="OfflineSorter"/>, and every sorted entry is then
        /// assigned a bucket according to its position in the sorted file.
        /// </summary>
        /// <param name="iterator">Source of entries and weights; must not report payloads or contexts.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="iterator"/> is <c>null</c>.</exception>
        /// <exception cref="System.ArgumentException">The iterator reports payloads or contexts.</exception>
        public override void Build(IInputIterator iterator)
        {
            if (iterator == null)
            {
                // Fail with a descriptive exception instead of a NullReferenceException below.
                throw new System.ArgumentNullException("iterator");
            }
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
            FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

            OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            ExternalRefSorter sorter = null;

            // Push the encoded weights up front, before the sequences, so that sorting orders by them.
            // For now, assume they are non-negative; negative values would need extra care to
            // get a usable byte order.
            bool success = false;
            count = 0;
            try
            {
                byte[] buffer = new byte[0];
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                BytesRef spare;
                while ((spare = iterator.Next()) != null)
                {
                    // Grow the scratch buffer when the entry plus the int weight prefix may not fit.
                    if (spare.Length + 4 >= buffer.Length)
                    {
                        buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                    }

                    output.Reset(buffer);
                    output.WriteInt(EncodeWeight(iterator.Weight));
                    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                    writer.Write(buffer, 0, output.Position);
                }
                // Finish writing the input file before it is handed to the sorter.
                writer.Dispose();

                // We don't know the distribution of scores and we need to bucket them, so we'll sort
                // and divide into equal buckets.
                OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
                tempInput.Delete();
                FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                int inputLines = info.Lines;
                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                long line = 0;
                int previousBucket = 0;
                int previousScore = 0;
                ByteArrayDataInput input = new ByteArrayDataInput();
                BytesRef tmp1 = new BytesRef();
                BytesRef tmp2 = new BytesRef();
                while (reader.Read(tmp1))
                {
                    input.Reset(tmp1.Bytes);
                    int currentScore = input.ReadInt();

                    // Consecutive entries with the same score share a bucket; otherwise the bucket
                    // is proportional to the entry's position in the sorted file.
                    int bucket;
                    if (line > 0 && currentScore == previousScore)
                    {
                        bucket = previousBucket;
                    }
                    else
                    {
                        bucket = (int)(line * buckets / inputLines);
                    }
                    previousScore = currentScore;
                    previousBucket = bucket;

                    // Only append the input, discard the weight.
                    tmp2.Bytes = tmp1.Bytes;
                    tmp2.Offset = input.Position;
                    tmp2.Length = tmp1.Length - input.Position;
                    builder.Add(tmp2, bucket);

                    line++;
                    count++;
                }

                // The two FSTCompletions share the same automaton.
                this.higherWeightsCompletion = builder.Build();
                this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                success = true;
            }
            finally
            {
                try
                {
                    if (success)
                    {
                        IOUtils.Close(reader, writer, sorter);
                    }
                    else
                    {
                        IOUtils.CloseWhileHandlingException(reader, writer, sorter);
                    }
                }
                finally
                {
                    // Always remove the temporary files, even when closing the resources above
                    // throws, and delete each one independently so a failure on the first does
                    // not leak the second.
                    try
                    {
                        tempInput.Delete();
                    }
                    finally
                    {
                        tempSorted.Delete();
                    }
                }
            }
        }
Esempio n. 4
0
 /// <summary>
 /// Creates a sorter that buffers all sequences to a temporary file and
 /// then sorts them (all on-disk) with the supplied <see cref="OfflineSorter"/>.
 /// </summary>
 /// <param name="sort">The sorter stored for later use by this instance.</param>
 public ExternalRefSorter(OfflineSorter sort)
 {
     this.sort = sort;

     // Obtain a fresh temporary file and open a sequence writer on it.
     string tempPath = Path.GetTempFileName();
     FileInfo tempFile = new FileInfo(tempPath);
     this.input = tempFile;
     this.writer = new OfflineSorter.ByteSequencesWriter(this.input);
 }