示例#1
0
        /// <summary>
        /// Fixture setup: fills <c>INTS</c> and <c>LONGS</c> with random values and serializes
        /// each value into <c>RANDOM_TEST_BYTES</c>, first in variable-length form and then in
        /// fixed-length form.
        /// </summary>
        public override void BeforeClass()
        {
            base.BeforeClass();

            Random rnd = Random;

            INTS  = new int[COUNT];
            LONGS = new long[COUNT];

            // Per entry the buffer reserves 5 + 4 + 9 + 8 bytes for the four writes below.
            RANDOM_TEST_BYTES = new byte[COUNT * (5 + 4 + 9 + 8)];
            ByteArrayDataOutput sink = new ByteArrayDataOutput(RANDOM_TEST_BYTES);

            for (int idx = 0; idx < COUNT; idx++)
            {
                int intValue = rnd.Next();
                INTS[idx] = intValue;
                sink.WriteVInt32(intValue);
                sink.WriteInt32(intValue);

                // Rarely: a long with lots of zeroes at the end; otherwise any non-negative long.
                long longValue = Rarely()
                    ? TestUtil.NextInt64(rnd, 0, int.MaxValue) << 32
                    : TestUtil.NextInt64(rnd, 0, long.MaxValue);
                LONGS[idx] = longValue;
                sink.WriteVInt64(longValue);
                sink.WriteInt64(longValue);
            }
        }
 /// <summary>
 /// Runs <paramref name="compressor"/> over <paramref name="len"/> bytes of
 /// <paramref name="decompressed"/> starting at <paramref name="off"/> and returns the
 /// output trimmed to the number of bytes actually written.
 /// </summary>
 internal static byte[] Compress(Compressor compressor, byte[] decompressed, int off, int len)
 {
     // Scratch space: twice the input plus 16 bytes should be enough.
     byte[] scratch = new byte[len * 2 + 16];
     var sink = new ByteArrayDataOutput(scratch);

     compressor.Compress(decompressed, off, len, sink);

     return Arrays.CopyOf(scratch, sink.Position);
 }
示例#3
0
        /// <summary>
        /// Runs <paramref name="compressor"/> over <paramref name="len"/> elements of
        /// <paramref name="decompressed"/> starting at <paramref name="off"/> and returns the
        /// output trimmed to the number of bytes actually written.
        /// </summary>
        internal static sbyte[] Compress(Compressor compressor, sbyte[] decompressed, int off, int len)
        {
            // Scratch space: twice the input plus 16 bytes should be enough.
            var scratch = new sbyte[len * 2 + 16];

            // The output is handed the same array object, cast to byte[] by way of Array.
            var sink = new ByteArrayDataOutput((byte[])(Array)scratch);

            compressor.Compress(decompressed, off, len, sink);

            return Arrays.CopyOf(scratch, sink.Position);
        }
示例#4
0
 /// <summary>
 /// Serializes one entry — the bytes of <paramref name="spare"/> followed by
 /// <paramref name="weight"/> as a 64-bit integer — and passes it to <paramref name="writer"/>.
 /// </summary>
 /// <param name="writer"> Destination for the encoded entry. </param>
 /// <param name="output"> Reusable output that is reset onto the working buffer. </param>
 /// <param name="buffer"> Working buffer; replaced locally by a larger array when too small. </param>
 /// <param name="spare"> Entry bytes to encode. </param>
 /// <param name="weight"> Weight written after the entry bytes. </param>
 protected internal virtual void Encode(OfflineSorter.ByteSequencesWriter writer,
                                        ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight)
 {
     int needed = spare.Length + 8; // entry bytes plus the long weight
     if (needed >= buffer.Length)
     {
         // Only the local parameter is reassigned; the caller's array reference is not updated.
         buffer = ArrayUtil.Grow(buffer, needed);
     }

     output.Reset(buffer);
     output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
     output.WriteInt64(weight);

     writer.Write(buffer, 0, output.Position);
 }
示例#5
0
        /// <summary>
        /// Encodes every entry of <c>source</c> into a temporary input file, sorts that file
        /// with <c>tieBreakByCostComparer</c> into a second temporary file, and returns a reader
        /// over the sorted file. On failure the writer is disposed quietly and
        /// <c>Dispose()</c> is called.
        /// </summary>
        private OfflineSorter.ByteSequencesReader Sort()
        {
            string namePrefix = this.GetType().Name;
            DirectoryInfo tempDir = OfflineSorter.DefaultTempDir();

            tempInput = FileSupport.CreateTempFile(namePrefix, ".input", tempDir);
            tempSorted = FileSupport.CreateTempFile(namePrefix, ".sorted", tempDir);

            var unsortedWriter = new OfflineSorter.ByteSequencesWriter(tempInput);
            bool succeeded = false;

            try
            {
                byte[] scratch = new byte[0];
                var encoder = new ByteArrayDataOutput(scratch);

                for (BytesRef entry = source.Next(); entry != null; entry = source.Next())
                {
                    Encode(unsortedWriter, encoder, scratch, entry, source.Payload, source.Contexts, source.Weight);
                }

                // The writer is disposed before sorting reads the file it wrote.
                unsortedWriter.Dispose();

                var sorter = new OfflineSorter(tieBreakByCostComparer);
                sorter.Sort(tempInput, tempSorted);

                var sortedReader = new OfflineSorter.ByteSequencesReader(tempSorted);
                succeeded = true;
                return sortedReader;
            }
            finally
            {
                if (!succeeded)
                {
                    try
                    {
                        IOUtils.DisposeWhileHandlingException(unsortedWriter);
                    }
                    finally
                    {
                        Dispose();
                    }
                }
                else
                {
                    IOUtils.Dispose(unsortedWriter);
                }
            }
        }
示例#6
0
        /// <summary>
        /// Encodes every entry of <c>source</c> into a temporary input file, sorts that file
        /// with <c>tieBreakByCostComparer</c> into a second temporary file, and returns a reader
        /// over the sorted file. On failure the writer is disposed quietly and
        /// <c>Close()</c> is called.
        /// </summary>
        private OfflineSorter.ByteSequencesReader Sort()
        {
            string namePrefix = this.GetType().Name;
            DirectoryInfo tempDir = OfflineSorter.DefaultTempDir();

            tempInput = FileSupport.CreateTempFile(namePrefix, ".input", tempDir);
            tempSorted = FileSupport.CreateTempFile(namePrefix, ".sorted", tempDir);

            var unsortedWriter = new OfflineSorter.ByteSequencesWriter(tempInput);
            bool succeeded = false;

            try
            {
                byte[] scratch = Arrays.Empty <byte>();
                var encoder = new ByteArrayDataOutput(scratch);

                while (source.MoveNext())
                {
                    Encode(unsortedWriter, encoder, scratch, source.Current, source.Weight);
                }

                // The writer is disposed before sorting reads the file it wrote.
                unsortedWriter.Dispose();

                var sorter = new OfflineSorter(tieBreakByCostComparer);
                sorter.Sort(tempInput, tempSorted);

                var sortedReader = new OfflineSorter.ByteSequencesReader(tempSorted);
                succeeded = true;
                return sortedReader;
            }
            finally
            {
                if (!succeeded)
                {
                    try
                    {
                        IOUtils.DisposeWhileHandlingException(unsortedWriter);
                    }
                    finally
                    {
                        Close();
                    }
                }
                else
                {
                    IOUtils.Dispose(unsortedWriter);
                }
            }
        }
示例#7
0
        /// <summary>
        /// Encodes an entry (bytes+(contexts)+(payload)+weight) to the provided writer.
        /// </summary>
        /// <remarks>
        /// Written layout: the entry bytes; when <c>hasContexts</c> is set, each context's bytes
        /// followed by its length as a 16-bit value, then the number of contexts as a 16-bit
        /// value; when <c>hasPayloads</c> is set, the payload bytes followed by the payload
        /// length as a 16-bit value; finally the weight as a 64-bit value.
        /// </remarks>
        /// <param name="writer"> Destination for the encoded entry. </param>
        /// <param name="output"> Reusable output that is reset onto the working buffer. </param>
        /// <param name="buffer"> Working buffer; replaced locally by a larger array when too small. </param>
        /// <param name="spare"> Entry bytes to encode. </param>
        /// <param name="payload"> Payload bytes; only read when <c>hasPayloads</c> is set. </param>
        /// <param name="contexts"> Contexts; only read when <c>hasContexts</c> is set. </param>
        /// <param name="weight"> Weight written last. </param>
        protected internal virtual void Encode(OfflineSorter.ByteSequencesWriter writer,
                                               ByteArrayDataOutput output, byte[] buffer, BytesRef spare, BytesRef payload,
                                               IEnumerable <BytesRef> contexts, long weight)
        {
            int requiredLength = spare.Length + 8 + ((hasPayloads) ? 2 + payload.Length : 0);

            if (hasContexts)
            {
                foreach (BytesRef ctx in contexts)
                {
                    requiredLength += 2 + ctx.Length;
                }
                requiredLength += 2; // for length of contexts
            }
            if (requiredLength >= buffer.Length)
            {
                // Only the local parameter is reassigned; the caller's array reference is not updated.
                buffer = ArrayUtil.Grow(buffer, requiredLength);
            }
            output.Reset(buffer);
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            if (hasContexts)
            {
                // Count while writing instead of enumerating the sequence again, so the
                // stored count always matches the number of contexts actually written.
                int contextCount = 0;
                foreach (BytesRef ctx in contexts)
                {
                    output.WriteBytes(ctx.Bytes, ctx.Offset, ctx.Length);
                    output.WriteInt16((short)ctx.Length);
                    contextCount++;
                }
                output.WriteInt16((short)contextCount);
            }
            if (hasPayloads)
            {
                output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                output.WriteInt16((short)payload.Length);
            }
            output.WriteInt64(weight);
            writer.Write(buffer, 0, output.Position);
        }
示例#8
0
 /// <summary>
 /// Serializes one entry — the bytes of <paramref name="spare"/> followed by the result of
 /// <c>EncodeWeight(weight)</c> as a 32-bit integer — and passes it to
 /// <paramref name="writer"/>. <paramref name="payload"/> and <paramref name="contexts"/>
 /// are not used by this override.
 /// </summary>
 protected internal override void Encode(OfflineSorter.ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, BytesRef payload, ICollection <BytesRef> contexts, long weight)
 {
     int needed = spare.Length + 4; // entry bytes plus the int-encoded weight
     if (needed >= buffer.Length)
     {
         // Only the local parameter is reassigned; the caller's array reference is not updated.
         buffer = ArrayUtil.Grow(buffer, needed);
     }

     output.Reset(buffer);
     output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
     output.WriteInt32(EncodeWeight(weight));

     writer.Write(buffer, 0, output.Position);
 }
示例#9
0
        /// <summary>
        /// Builds the completion automata from <paramref name="iterator"/>: every entry is
        /// written to a temporary file prefixed with its encoded weight, the file is sorted,
        /// and the sorted entries are assigned to buckets and added to an
        /// <c>FSTCompletionBuilder</c>. Both temporary files are deleted when the method exits.
        /// </summary>
        /// <param name="iterator"> Source of entries and weights. </param>
        /// <exception cref="System.ArgumentException"> If the iterator reports payloads or contexts. </exception>
        public override void Build(IInputIterator iterator)
        {
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            FileInfo tempInput  = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
            FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

            OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            ExternalRefSorter sorter = null;

            // Push floats up front before sequences to sort them. For now, assume they are non-negative.
            // If negative floats are allowed some trickery needs to be done to find their byte order.
            bool success = false;

            count = 0;
            try
            {
                byte[] buffer = new byte[0];
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                BytesRef            spare;
                while ((spare = iterator.Next()) != null)
                {
                    // Make room for the entry bytes plus the 4-byte encoded weight.
                    if (spare.Length + 4 >= buffer.Length)
                    {
                        buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                    }

                    // Record layout: encoded weight first, then the entry bytes.
                    output.Reset(buffer);
                    output.WriteInt32(EncodeWeight(iterator.Weight));
                    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                    writer.Write(buffer, 0, output.Position);
                }
                // Dispose the writer before the sorter reads the file it wrote.
                writer.Dispose();

                // We don't know the distribution of scores and we need to bucket them, so we'll sort
                // and divide into equal buckets.
                OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
                tempInput.Delete();
                FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                int inputLines = info.Lines;
                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                long line                = 0;
                int  previousBucket      = 0;
                int  previousScore       = 0;
                ByteArrayDataInput input = new ByteArrayDataInput();
                BytesRef           tmp1  = new BytesRef();
                BytesRef           tmp2  = new BytesRef();
                while (reader.Read(tmp1))
                {
                    // NOTE(review): the input is reset on tmp1.Bytes without tmp1.Offset, so this
                    // presumably relies on the reader filling tmp1 from index 0 — confirm.
                    input.Reset(tmp1.Bytes);
                    int currentScore = input.ReadInt32();

                    // Entries with the same score as the previous line stay in the previous bucket;
                    // otherwise the bucket is derived from the line's position in the sorted file.
                    int bucket;
                    if (line > 0 && currentScore == previousScore)
                    {
                        bucket = previousBucket;
                    }
                    else
                    {
                        bucket = (int)(line * buckets / inputLines);
                    }
                    previousScore  = currentScore;
                    previousBucket = bucket;

                    // Only append the input, discard the weight.
                    tmp2.Bytes  = tmp1.Bytes;
                    tmp2.Offset = input.Position;
                    tmp2.Length = tmp1.Length - input.Position;
                    builder.Add(tmp2, bucket);

                    line++;
                    count++;
                }

                // The two FSTCompletions share the same automaton.
                this.higherWeightsCompletion = builder.Build();
                this.normalCompletion        = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                success = true;
            }
            finally
            {
                // On success close normally; on failure close through the
                // CloseWhileHandlingException variant instead.
                if (success)
                {
                    IOUtils.Close(reader, writer, sorter);
                }
                else
                {
                    IOUtils.CloseWhileHandlingException(reader, writer, sorter);
                }

                // Both temporary files are removed on every exit path.
                tempInput.Delete();
                tempSorted.Delete();
            }
        }
示例#10
0
 /// <summary>
 /// Allocates an 11-byte backing array and a <c>ByteArrayDataOutput</c> that writes into it.
 /// </summary>
 public FixedLengthArcsBuffer()
 {
     bytes = new byte[11];
     bado = new ByteArrayDataOutput(bytes);
 }
示例#11
0
        /// <summary>
        /// Indexes <c>int.MaxValue</c> documents whose binary doc value is the VInt32 encoding of
        /// <c>i % 65535</c>, force-merges to one segment, then reads every value back and checks it.
        /// </summary>
        /// <param name="newScheduler"> Factory for the merge scheduler used by the writer. </param>
        public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulerFactories), "Values")] Func <IConcurrentMergeScheduler> newScheduler)
        {
            BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));

            var mockDir = dir as MockDirectoryWrapper;
            if (mockDir != null)
            {
                mockDir.Throttling = Throttling.NEVER;
            }

            IndexWriterConfig config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
                                       .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                                       .SetRAMBufferSizeMB(256.0)
                                       .SetMergeScheduler(newScheduler())
                                       .SetMergePolicy(NewLogMergePolicy(false, 10))
                                       .SetOpenMode(OpenMode.CREATE);
            var writer = new IndexWriter(dir, config);

            // One document instance is reused; its field value shares the 4-byte scratch array.
            var doc = new Document();
            byte[] bytes = new byte[4];
            var encoder = new ByteArrayDataOutput(bytes);
            var data = new BytesRef(bytes);
            doc.Add(new BinaryDocValuesField("dv", data));

            for (int i = 0; i < int.MaxValue; i++)
            {
                encoder.Reset(bytes);
                encoder.WriteVInt32(i % 65535); // 1, 2, or 3 bytes
                data.Length = encoder.Position;
                writer.AddDocument(doc);

                if (i % 100000 == 0)
                {
                    Console.WriteLine("indexed: " + i);
                    Console.Out.Flush();
                }
            }

            writer.ForceMerge(1);
            writer.Dispose();

            Console.WriteLine("verifying...");
            Console.Out.Flush();

            DirectoryReader indexReader = DirectoryReader.Open(dir);
            var input = new ByteArrayDataInput();
            int expectedValue = 0;

            foreach (AtomicReaderContext context in indexReader.Leaves)
            {
                AtomicReader leaf = context.AtomicReader;
                var scratch = new BytesRef(bytes);
                BinaryDocValues dv = leaf.GetBinaryDocValues("dv");

                for (int docId = 0; docId < leaf.MaxDoc; docId++)
                {
                    dv.Get(docId, scratch);
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    Assert.AreEqual(expectedValue % 65535, input.ReadVInt32());
                    Assert.IsTrue(input.Eof);
                    expectedValue++;
                }
            }

            indexReader.Dispose();
            dir.Dispose();
        }
        /// <summary>
        /// Indexes <c>int.MaxValue</c> documents whose binary doc value is the VInt encoding of
        /// <c>i % 65535</c>, force-merges to one segment, then reads every value back and checks it.
        /// </summary>
        /// <param name="scheduler"> Merge scheduler used by the writer. </param>
        public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")]IConcurrentMergeScheduler scheduler)
        {
            BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));

            var mockDir = dir as MockDirectoryWrapper;
            if (mockDir != null)
            {
                mockDir.Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
            }

            IndexWriterConfig config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
                                       .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                                       .SetRAMBufferSizeMB(256.0)
                                       .SetMergeScheduler(scheduler)
                                       .SetMergePolicy(NewLogMergePolicy(false, 10))
                                       .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);
            var writer = new IndexWriter(dir, config);

            // One document instance is reused; its field value shares the 4-byte scratch array.
            var doc = new Document();
            byte[] bytes = new byte[4];
            var encoder = new ByteArrayDataOutput(bytes);
            var data = new BytesRef(bytes);
            doc.Add(new BinaryDocValuesField("dv", data));

            for (int i = 0; i < int.MaxValue; i++)
            {
                encoder.Reset(bytes);
                encoder.WriteVInt(i % 65535); // 1, 2, or 3 bytes
                data.Length = encoder.Position;
                writer.AddDocument(doc);

                if (i % 100000 == 0)
                {
                    Console.WriteLine("indexed: " + i);
                    Console.Out.Flush();
                }
            }

            writer.ForceMerge(1);
            writer.Dispose();

            Console.WriteLine("verifying...");
            Console.Out.Flush();

            DirectoryReader indexReader = DirectoryReader.Open(dir);
            var input = new ByteArrayDataInput();
            int expectedValue = 0;

            foreach (AtomicReaderContext context in indexReader.Leaves)
            {
                AtomicReader leaf = context.AtomicReader;
                var scratch = new BytesRef(bytes);
                BinaryDocValues dv = leaf.GetBinaryDocValues("dv");

                for (int docId = 0; docId < leaf.MaxDoc; docId++)
                {
                    dv.Get(docId, scratch);
                    input.Reset((byte[])(Array)scratch.Bytes, scratch.Offset, scratch.Length);
                    Assert.AreEqual(expectedValue % 65535, input.ReadVInt());
                    Assert.IsTrue(input.Eof());
                    expectedValue++;
                }
            }

            indexReader.Dispose();
            dir.Dispose();
        }
示例#13
0
        /// <summary>
        /// Parses a specific affix rule putting the result into the provided affix map.
        /// Each rule line is also encoded as four 16-bit values (flag, strip ordinal,
        /// pattern ordinal with the cross-product bit, append-flags ordinal) into <c>affixData</c>.
        /// </summary>
        /// <param name="affixes"> Map where the result of the parsing will be put </param>
        /// <param name="header"> Header line of the affix rule </param>
        /// <param name="reader"> Reader to read the content of the rule from </param>
        /// <param name="conditionPattern"> <c>string.Format</c> pattern to be used to generate the condition regex
        ///                         pattern </param>
        /// <param name="seenPatterns"> map from condition -&gt; index of patterns, for deduplication. </param>
        /// <param name="seenStrips"> map from strip string -&gt; strip ordinal, for deduplication. </param>
        /// <exception cref="IOException"> Can be thrown while reading the rule </exception>
        /// <exception cref="Exception"> If a rule line has fewer than four elements </exception>
        /// <exception cref="System.NotSupportedException"> If the number of patterns, strips or append-flag sets
        ///                         exceeds what the 16-bit encoding can hold </exception>
        private void ParseAffix(SortedDictionary <string, IList <char?> > affixes, string header, TextReader reader, string conditionPattern, IDictionary <string, int?> seenPatterns, IDictionary <string, int?> seenStrips)
        {
            BytesRef      scratch = new BytesRef();
            StringBuilder sb      = new StringBuilder();

            string[] args = whitespacePattern.Split(header);

            bool crossProduct = args[2].Equals("Y");

            int numLines = int.Parse(args[3], CultureInfo.InvariantCulture);

            // Every affix occupies 8 bytes (four shorts), hence the << 3.
            affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3));
            ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

            for (int i = 0; i < numLines; i++)
            {
                Debug.Assert(affixWriter.Position == currentAffix << 3);
                string   line     = reader.ReadLine();
                string[] ruleArgs = whitespacePattern.Split(line);

                // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
                // condition is optional
                if (ruleArgs.Length < 4)
                {
                    throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader
                }

                char   flag        = flagParsingStrategy.ParseFlag(ruleArgs[1]);
                string strip       = ruleArgs[2].Equals("0") ? "" : ruleArgs[2];
                string affixArg    = ruleArgs[3];
                char[] appendFlags = null;

                // An affix of the form "text/flags" carries continuation flags after the last '/'.
                int flagSep = affixArg.LastIndexOf('/');
                if (flagSep != -1)
                {
                    string flagPart = affixArg.Substring(flagSep + 1);
                    affixArg = affixArg.Substring(0, flagSep);

                    if (aliasCount > 0)
                    {
                        flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
                    }

                    appendFlags = flagParsingStrategy.ParseFlags(flagPart);
                    Array.Sort(appendFlags);
                    twoStageAffix = true;
                }

                // TODO: add test and fix zero-affix handling!

                string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
                // at least the gascon affix file has this issue
                if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
                {
                    condition = condition + "]";
                }
                // "dash hasn't got special meaning" (we must escape it)
                if (condition.IndexOf('-') >= 0)
                {
                    condition = condition.Replace("-", "\\-");
                }

                string regex;
                if (".".Equals(condition))
                {
                    regex = ".*"; // Zero condition is indicated by dot
                }
                else if (condition.Equals(strip))
                {
                    regex = ".*"; // TODO: optimize this better:
                                  // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                                  // but this is complicated...
                }
                else
                {
                    regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition);
                }

                // deduplicate patterns (single lookup; stays null when the regex is new)
                int? patternIndex;
                seenPatterns.TryGetValue(regex, out patternIndex);
                if (patternIndex == null)
                {
                    patternIndex = patterns.Count;
                    if (patternIndex > short.MaxValue)
                    {
                        throw new System.NotSupportedException("Too many patterns, please report this to dev@lucene.apache.org");
                    }
                    seenPatterns[regex] = patternIndex;
                    CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton());
                    patterns.Add(pattern);
                }

                // deduplicate strips (single lookup; stays null when the strip is new)
                int? stripOrd;
                seenStrips.TryGetValue(strip, out stripOrd);
                if (stripOrd == null)
                {
                    stripOrd          = seenStrips.Count;
                    seenStrips[strip] = stripOrd;
                    if (stripOrd > char.MaxValue)
                    {
                        throw new System.NotSupportedException("Too many unique strips, please report this to dev@lucene.apache.org");
                    }
                }

                if (appendFlags == null)
                {
                    appendFlags = NOFLAGS;
                }

                EncodeFlags(scratch, appendFlags);
                int appendFlagsOrd = flagLookup.Add(scratch);
                if (appendFlagsOrd < 0)
                {
                    // already exists in our hash
                    appendFlagsOrd = (-appendFlagsOrd) - 1;
                }
                else if (appendFlagsOrd > short.MaxValue)
                {
                    // this limit is probably flexible, but its a good sanity check too
                    throw new System.NotSupportedException("Too many unique append flags, please report this to dev@lucene.apache.org");
                }

                affixWriter.WriteShort((short)flag);
                affixWriter.WriteShort((short)stripOrd);
                // encode crossProduct into patternIndex
                int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
                affixWriter.WriteShort((short)patternOrd);
                affixWriter.WriteShort((short)appendFlagsOrd);

                if (needsInputCleaning)
                {
                    affixArg = CleanInput(affixArg, sb);
                }

                // Fetch (or create) the list of affix ordinals registered for this affix text.
                IList <char?> list;
                if (!affixes.TryGetValue(affixArg, out list) || list == null)
                {
                    list = new List <char?>();
                    affixes[affixArg] = list;
                }

                list.Add((char)currentAffix);
                currentAffix++;
            }
        }
示例#14
0
        /// <summary>
        /// Builds the suggester FST from <paramref name="enumerator"/>. Each surface form is
        /// analyzed into its finite strings; every (analyzed form, weight, surface form
        /// [, payload]) record is written to a temporary file, sorted, and then added to the
        /// FST builder, skipping duplicate surface forms per analyzed form. Both temporary
        /// files are deleted when the method exits.
        /// </summary>
        /// <param name="enumerator"> Source of surface forms, weights and optional payloads. </param>
        /// <exception cref="ArgumentException"> If the enumerator reports contexts, if an analyzed
        /// form (or, with payloads, a surface form) is longer than <c>ushort.MaxValue - 2</c>, or if
        /// a surface form with payload contains the reserved separator byte. </exception>
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            // Scan the surface form's own slice (Offset .. Offset + Length), the
                            // same slice that is written below, for the reserved separator.
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[surfaceForm.Offset + i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                // Dispose the writer before the sorter reads the file it wrote.
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the highest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    // Decode the record in the order it was written above:
                    // analyzedLength, analyzed bytes, weight, then surface (and payload).
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    // Two spare bytes are reserved for the (0, dedup) suffix appended below.
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        // FST output for payload entries: surface bytes + PAYLOAD_SEP + payload bytes.
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                // On success dispose normally; on failure dispose through the
                // DisposeWhileHandlingException variant instead.
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                // Both temporary files are removed on every exit path.
                tempInput.Delete();
                tempSorted.Delete();
            }
        }
示例#15
0
            /// <summary>
            /// Builds an <see cref="SynonymMap"/> and returns it.
            /// </summary>
            /// <remarks>
            /// Every key of <c>workingSet</c> is visited in the order produced by
            /// <c>CharsRef.UTF16SortedAsUTF8Comparator</c>. For each key the entry is serialized as a
            /// header vInt (<c>count &lt;&lt; 1</c>, with the low bit set when <c>includeOrig</c> is false)
            /// followed by one vInt per ordinal, and a copy of those bytes is added to the FST builder.
            /// When <c>dedup</c> is set, an ordinal that repeats within one entry is written only once.
            /// The Java original of this method declared <c>IOException</c>.
            /// </remarks>
            /// <returns>
            /// A <see cref="SynonymMap"/> created from the finished FST, <c>words</c> and
            /// <c>maxHorizontalContext</c>.
            /// </returns>
            public virtual SynonymMap build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;

                // TODO: are we using the best sharing options?
                // NOTE(review): the Java package-qualified type name and the lower-case member calls
                // (grow, reset, writeVInt, add, toUTF32, deepCopyOf, finish) are kept exactly as the
                // converter emitted them - confirm against the names the ported classes really expose.
                org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                BytesRef scratch = new BytesRef(64);
                ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

                // Only allocated when de-duplication is requested; null means every ordinal is written.
                HashSet<int?> dedupSet = dedup ? new HashSet<int?>() : null;

                // Parking space for the header vInt while the ordinals are shifted; sized like the
                // 5-byte header allowance in estimatedSize below.
                // NOTE(review): assumes scratch.bytes is also sbyte[] so Array.Copy element types match - confirm.
                sbyte[] spare = new sbyte[5];

                // Snapshot the keys into an array so they can be sorted independently of the dictionary.
                Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
                CharsRef[] sortedKeys = new CharsRef[keys.Count];
                keys.CopyTo(sortedKeys, 0);
                Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

                IntsRef scratchIntsRef = new IntsRef();

                foreach (CharsRef input in sortedKeys)
                {
                    MapEntry output = workingSet[input];

                    int numEntries = output.ords.Count;
                    // output size, assume the worst case
                    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

                    scratch.grow(estimatedSize);
                    scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.Length);
                    Debug.Assert(scratch.offset == 0);

                    // now write our output data:
                    int count = 0;
                    for (int i = 0; i < numEntries; i++)
                    {
                        // HashSet.Add reports false for a value it already holds, so a single call
                        // both tests for and records the ordinal.
                        if (dedupSet != null && !dedupSet.Add(output.ords[i]))
                        {
                            continue;
                        }
                        scratchOutput.writeVInt(output.ords[i]);
                        count++;
                    }

                    // The header can only be written once the (possibly de-duplicated) count is known,
                    // so it is appended after the ordinals and then rotated to the front.
                    int pos = scratchOutput.Position;
                    scratchOutput.writeVInt((count << 1) | (output.includeOrig ? 0 : 1));
                    int vIntLen = scratchOutput.Position - pos;

                    // Move the count + includeOrig to the front of the byte[]:
                    Array.Copy(scratch.bytes, pos, spare, 0, vIntLen);
                    // Source and destination overlap here; Array.Copy is specified to handle that.
                    Array.Copy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
                    Array.Copy(spare, 0, scratch.bytes, 0, vIntLen);

                    // The set is reused for the next key.
                    if (dedupSet != null)
                    {
                        dedupSet.Clear();
                    }

                    scratch.length = scratchOutput.Position - scratch.offset;
                    // scratch is overwritten on the next iteration, so the builder receives a copy.
                    builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
                }

                FST<BytesRef> fst = builder.finish();

                return new SynonymMap(fst, words, maxHorizontalContext);
            }
示例#16
0
            /// <summary>
            /// Builds an <see cref="SynonymMap"/> and returns it.
            /// </summary>
            /// <remarks>
            /// Every key of <c>workingSet</c> is visited in the order produced by
            /// <c>CharsRef.UTF16SortedAsUTF8Comparer</c>. For each key the entry is serialized as a
            /// header vInt (<c>count &lt;&lt; 1</c>, with the low bit set when <c>includeOrig</c> is false)
            /// followed by one vInt per ordinal, and a copy of those bytes is added to the FST builder.
            /// When <c>dedup</c> is set, an ordinal that repeats within one entry is written only once.
            /// </remarks>
            /// <returns>
            /// A <see cref="SynonymMap"/> created from the finished FST, <c>words</c> and
            /// <c>maxHorizontalContext</c>.
            /// </returns>
            public virtual SynonymMap Build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                // TODO: are we using the best sharing options?
                var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                BytesRef scratch = new BytesRef(64);
                ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

                // Only allocated when de-duplication is requested; null means every ordinal is written.
                // Ordinals are plain ints (they are passed straight to WriteVInt32), so no Nullable wrapper is needed.
                HashSet<int> dedupSet = dedup ? new HashSet<int>() : null;

                // Parking space for the header vInt while the ordinals are shifted; sized like the
                // 5-byte header allowance in estimatedSize below.
                var spare = new byte[5];

                ICollection<CharsRef> keys = workingSet.Keys;
                CharsRef[] sortedKeys = keys.ToArray();
                // 612/618 are the obsolete-member warnings; they are silenced for this one call only.
#pragma warning disable 612, 618
                System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618

                Int32sRef scratchIntsRef = new Int32sRef();

                foreach (CharsRef input in sortedKeys)
                {
                    MapEntry output = workingSet[input];

                    int numEntries = output.ords.Count;
                    // output size, assume the worst case
                    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

                    scratch.Grow(estimatedSize);
                    scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
                    Debug.Assert(scratch.Offset == 0);

                    // now write our output data:
                    int count = 0;
                    for (int i = 0; i < numEntries; i++)
                    {
                        int ord = output.ords[i];
                        // HashSet.Add reports false for a value it already holds, so a single call
                        // both tests for and records the ordinal.
                        if (dedupSet != null && !dedupSet.Add(ord))
                        {
                            continue;
                        }
                        scratchOutput.WriteVInt32(ord);
                        count++;
                    }

                    // The header can only be written once the (possibly de-duplicated) count is known,
                    // so it is appended after the ordinals and then rotated to the front.
                    int pos = scratchOutput.Position;
                    scratchOutput.WriteVInt32((count << 1) | (output.includeOrig ? 0 : 1));
                    int vIntLen = scratchOutput.Position - pos;

                    // Move the count + includeOrig to the front of the byte[]:
                    Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
                    // Source and destination overlap here; Array.Copy is specified to handle that.
                    Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
                    Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

                    // The set is reused for the next key.
                    if (dedupSet != null)
                    {
                        dedupSet.Clear();
                    }

                    scratch.Length = scratchOutput.Position - scratch.Offset;
                    // scratch is overwritten on the next iteration, so the builder receives a copy.
                    builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
                }

                FST<BytesRef> fst = builder.Finish();
                return new SynonymMap(fst, words, maxHorizontalContext);
            }
示例#17
0
		/// <summary>
		/// Builds an <see cref="SynonymMap"/> and returns it.
		/// </summary>
		/// <remarks>
		/// Keys of <c>workingSet</c> are processed in the order given by
		/// <c>CharsRef.UTF16SortedAsUTF8Comparator</c>. Each entry becomes a header vInt
		/// (<c>count &lt;&lt; 1</c>, low bit set when <c>includeOrig</c> is false) followed by one vInt
		/// per ordinal; a copy of those bytes is handed to the FST builder. With <c>dedup</c> set,
		/// an ordinal repeated inside one entry is emitted once.
		/// The Java original of this method declared <c>IOException</c>.
		/// </remarks>
		/// <returns>
		/// A <see cref="SynonymMap"/> created from the finished FST, <c>words</c> and
		/// <c>maxHorizontalContext</c>.
		/// </returns>
		public virtual SynonymMap build()
		{
		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
		  // TODO: are we using the best sharing options?
		  // NOTE(review): the Java package-qualified type name and the lower-case member calls
		  // (grow, reset, writeVInt, add, toUTF32, deepCopyOf, finish) are kept exactly as the
		  // converter emitted them - confirm against the names the ported classes really expose.
		  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

		  BytesRef scratch = new BytesRef(64);
		  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

		  // Only allocated when de-duplication is requested; null means every ordinal is written.
		  HashSet<int?> dedupSet = dedup ? new HashSet<int?>() : null;

		  // Parking space for the header vInt while the ordinals are shifted; sized like the
		  // 5-byte header allowance in estimatedSize below.
		  // NOTE(review): assumes scratch.bytes is also sbyte[] so Array.Copy element types match - confirm.
		  sbyte[] spare = new sbyte[5];

		  // Snapshot the keys into an array so they can be sorted independently of the dictionary.
		  Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
		  CharsRef[] sortedKeys = new CharsRef[keys.Count];
		  keys.CopyTo(sortedKeys, 0);
		  Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

		  IntsRef scratchIntsRef = new IntsRef();

		  foreach (CharsRef input in sortedKeys)
		  {
			MapEntry output = workingSet[input];

			int numEntries = output.ords.Count;
			// output size, assume the worst case
			int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

			scratch.grow(estimatedSize);
			scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.Length);
			Debug.Assert(scratch.offset == 0);

			// now write our output data:
			int count = 0;
			for (int i = 0; i < numEntries; i++)
			{
			  // HashSet.Add reports false for a value it already holds, so a single call
			  // both tests for and records the ordinal.
			  if (dedupSet != null && !dedupSet.Add(output.ords[i]))
			  {
				continue;
			  }
			  scratchOutput.writeVInt(output.ords[i]);
			  count++;
			}

			// The header can only be written once the (possibly de-duplicated) count is known,
			// so it is appended after the ordinals and then rotated to the front.
			int pos = scratchOutput.Position;
			scratchOutput.writeVInt((count << 1) | (output.includeOrig ? 0 : 1));
			int vIntLen = scratchOutput.Position - pos;

			// Move the count + includeOrig to the front of the byte[]:
			Array.Copy(scratch.bytes, pos, spare, 0, vIntLen);
			// Source and destination overlap here; Array.Copy is specified to handle that.
			Array.Copy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
			Array.Copy(spare, 0, scratch.bytes, 0, vIntLen);

			// The set is reused for the next key.
			if (dedupSet != null)
			{
			  dedupSet.Clear();
			}

			scratch.length = scratchOutput.Position - scratch.offset;
			// scratch is overwritten on the next iteration, so the builder receives a copy.
			builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
		  }

		  FST<BytesRef> fst = builder.finish();
		  return new SynonymMap(fst, words, maxHorizontalContext);
		}