/// <summary>
/// Populates the shared test fixtures: random 32-bit values in INTS, random
/// 64-bit values in LONGS, and RANDOM_TEST_BYTES holding each value written
/// four ways (vint, fixed int, vlong, fixed long) via ByteArrayDataOutput.
/// </summary>
public override void BeforeClass()
{
    base.BeforeClass();
    Random random = Random;
    INTS = new int[COUNT];
    LONGS = new long[COUNT];
    // Worst-case encoded sizes per entry: vint (5) + int (4) + vlong (9) + long (8).
    RANDOM_TEST_BYTES = new byte[COUNT * (5 + 4 + 9 + 8)];
    var dataOutput = new ByteArrayDataOutput(RANDOM_TEST_BYTES);
    for (int index = 0; index < COUNT; index++)
    {
        int intValue = random.Next();
        INTS[index] = intValue;
        dataOutput.WriteVInt32(intValue);
        dataOutput.WriteInt32(intValue);

        long longValue;
        if (Rarely())
        {
            // Occasionally a long whose low 32 bits are all zero.
            longValue = TestUtil.NextInt64(random, 0, int.MaxValue) << 32;
        }
        else
        {
            longValue = TestUtil.NextInt64(random, 0, long.MaxValue);
        }
        LONGS[index] = longValue;
        dataOutput.WriteVInt64(longValue);
        dataOutput.WriteInt64(longValue);
    }
}
/// <summary>
/// Compresses <paramref name="len"/> bytes of <paramref name="decompressed"/>
/// starting at <paramref name="off"/> using the given <paramref name="compressor"/>,
/// returning a new array trimmed to the exact compressed length.
/// </summary>
internal static byte[] Compress(Compressor compressor, byte[] decompressed, int off, int len)
{
    // Over-allocate the destination; twice the input plus slack should be enough.
    byte[] destination = new byte[len * 2 + 16];
    var output = new ByteArrayDataOutput(destination);
    compressor.Compress(decompressed, off, len, output);
    return Arrays.CopyOf(destination, output.Position);
}
/// <summary>
/// Compresses <paramref name="len"/> sbytes of <paramref name="decompressed"/>
/// starting at <paramref name="off"/> using the given <paramref name="compressor"/>,
/// returning a new array trimmed to the exact compressed length.
/// </summary>
internal static sbyte[] Compress(Compressor compressor, sbyte[] decompressed, int off, int len)
{
    // Over-allocate the destination; twice the input plus slack should be enough.
    sbyte[] destination = new sbyte[len * 2 + 16];
    // Reinterpret the sbyte[] as byte[] (same underlying CLR array) for the data output.
    var output = new ByteArrayDataOutput((byte[])(Array)destination);
    compressor.Compress(decompressed, off, len, output);
    return Arrays.CopyOf(destination, output.Position);
}
/// <summary>
/// Encodes an entry (bytes + weight) to the provided writer.
/// The key bytes are written first, followed by the 8-byte weight.
/// </summary>
protected internal virtual void Encode(OfflineSorter.ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight)
{
    int required = spare.Length + 8; // key bytes + 8-byte weight
    if (required >= buffer.Length)
    {
        // Growth is local to this call; the caller's buffer reference is unchanged.
        buffer = ArrayUtil.Grow(buffer, required);
    }
    output.Reset(buffer);
    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
    output.WriteInt64(weight);
    writer.Write(buffer, 0, output.Position);
}
/// <summary>
/// Writes every (key, payload, contexts, weight) entry from <c>source</c> to a
/// temp file, offline-sorts it with <c>tieBreakByCostComparer</c>, and returns
/// a reader over the sorted file. On failure the writer is disposed while
/// suppressing secondary exceptions and this instance is disposed.
/// </summary>
private OfflineSorter.ByteSequencesReader Sort()
{
    string prefix = this.GetType().Name;
    DirectoryInfo directory = OfflineSorter.DefaultTempDir();
    tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);
    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    bool success = false;
    try
    {
        // Encode grows its own copy as needed; start with an empty scratch buffer.
        byte[] scratch = new byte[0];
        var output = new ByteArrayDataOutput(scratch);
        BytesRef current;
        while ((current = source.Next()) != null)
        {
            Encode(writer, output, scratch, current, source.Payload, source.Contexts, source.Weight);
        }
        writer.Dispose();
        new OfflineSorter(tieBreakByCostComparer).Sort(tempInput, tempSorted);
        var sortedReader = new OfflineSorter.ByteSequencesReader(tempSorted);
        success = true;
        return sortedReader;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(writer);
        }
        else
        {
            try
            {
                IOUtils.DisposeWhileHandlingException(writer);
            }
            finally
            {
                Dispose();
            }
        }
    }
}
/// <summary>
/// Writes every (key, weight) entry from <c>source</c> to a temp file,
/// offline-sorts it with <c>tieBreakByCostComparer</c>, and returns a reader
/// over the sorted file. On failure the writer is disposed while suppressing
/// secondary exceptions, then <c>Close()</c> is invoked.
/// </summary>
private OfflineSorter.ByteSequencesReader Sort()
{
    string prefix = this.GetType().Name;
    DirectoryInfo directory = OfflineSorter.DefaultTempDir();
    tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);
    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    bool success = false;
    try
    {
        // Encode grows its own copy as needed; start with an empty scratch buffer.
        byte[] scratch = Arrays.Empty<byte>();
        var output = new ByteArrayDataOutput(scratch);
        while (source.MoveNext())
        {
            Encode(writer, output, scratch, source.Current, source.Weight);
        }
        writer.Dispose();
        new OfflineSorter(tieBreakByCostComparer).Sort(tempInput, tempSorted);
        var sortedReader = new OfflineSorter.ByteSequencesReader(tempSorted);
        success = true;
        return sortedReader;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(writer);
        }
        else
        {
            try
            {
                IOUtils.DisposeWhileHandlingException(writer);
            }
            finally
            {
                Close();
            }
        }
    }
}
/// <summary>
/// Encodes an entry (bytes + (contexts) + (payload) + weight) to the provided writer.
/// Contexts are written as (bytes, length) pairs followed by the total count, and
/// the payload as (bytes, length), with the 8-byte weight last — lengths trail the
/// data, which suggests the record is decoded back-to-front (TODO confirm against
/// the corresponding decoder).
/// </summary>
protected internal virtual void Encode(OfflineSorter.ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, BytesRef payload, IEnumerable<BytesRef> contexts, long weight)
{
    // Snapshot the contexts once: the previous implementation enumerated the
    // sequence three times (size computation, writing, Count()), which is both
    // wasteful and unsafe for one-shot enumerables, and could make the written
    // count disagree with the entries actually written.
    IList<BytesRef> contextList = null;
    int requiredLength = spare.Length + 8 + ((hasPayloads) ? 2 + payload.Length : 0);
    if (hasContexts)
    {
        contextList = contexts as IList<BytesRef> ?? new List<BytesRef>(contexts);
        foreach (BytesRef ctx in contextList)
        {
            requiredLength += 2 + ctx.Length;
        }
        requiredLength += 2; // for length of contexts
    }
    if (requiredLength >= buffer.Length)
    {
        buffer = ArrayUtil.Grow(buffer, requiredLength);
    }
    output.Reset(buffer);
    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
    if (hasContexts)
    {
        foreach (BytesRef ctx in contextList)
        {
            output.WriteBytes(ctx.Bytes, ctx.Offset, ctx.Length);
            output.WriteInt16((short)ctx.Length);
        }
        output.WriteInt16((short)contextList.Count);
    }
    if (hasPayloads)
    {
        output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
        output.WriteInt16((short)payload.Length);
    }
    output.WriteInt64(weight);
    writer.Write(buffer, 0, output.Position);
}
/// <summary>
/// Encodes an entry (bytes + encoded weight) to the provided writer; the
/// weight is compressed to 4 bytes via <c>EncodeWeight</c>. Payload and
/// contexts are ignored by this override.
/// </summary>
protected internal override void Encode(OfflineSorter.ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, BytesRef payload, ICollection<BytesRef> contexts, long weight)
{
    int required = spare.Length + 4; // key bytes + 4-byte encoded weight
    if (required >= buffer.Length)
    {
        // Growth is local to this call; the caller's buffer reference is unchanged.
        buffer = ArrayUtil.Grow(buffer, required);
    }
    output.Reset(buffer);
    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
    output.WriteInt32(EncodeWeight(weight));
    writer.Write(buffer, 0, output.Position);
}
/// <summary>
/// Builds the two FSTCompletion automatons from the iterator's entries.
/// Each entry is written as (4-byte encoded weight, key bytes) to a temp file,
/// offline-sorted so equal weights become adjacent, then bucketed into
/// <c>buckets</c> score bands by sorted position and fed to an
/// <see cref="FSTCompletionBuilder"/>. Payloads and contexts are not supported.
/// </summary>
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;
    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = new byte[0];
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        while ((spare = iterator.Next()) != null)
        {
            if (spare.Length + 4 >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
            }
            output.Reset(buffer);
            // Weight first so the offline sort orders primarily by weight.
            output.WriteInt32(EncodeWeight(iterator.Weight));
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

        int inputLines = info.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp1 = new BytesRef();
        BytesRef tmp2 = new BytesRef();
        while (reader.Read(tmp1))
        {
            input.Reset(tmp1.Bytes);
            int currentScore = input.ReadInt32();
            int bucket;
            if (line > 0 && currentScore == previousScore)
            {
                // Equal scores must land in the same bucket.
                bucket = previousBucket;
            }
            else
            {
                // Bucket is proportional to the entry's position in sorted order.
                bucket = (int)(line * buckets / inputLines);
            }
            previousScore = currentScore;
            previousBucket = bucket;
            // Only append the input, discard the weight.
            tmp2.Bytes = tmp1.Bytes;
            tmp2.Offset = input.Position;
            tmp2.Length = tmp1.Length - input.Position;
            builder.Add(tmp2, bucket);
            line++;
            count++;
        }
        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer, sorter);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        // Temp files are removed regardless of outcome (Delete is a no-op if already gone).
        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Allocates the fixed scratch buffer and wraps it in a reusable
/// <see cref="ByteArrayDataOutput"/>. The 11-byte size is presumably the
/// maximum encoded arc size this buffer must hold — TODO confirm against callers.
/// </summary>
public FixedLengthArcsBuffer()
{
    bytes = new byte[11];
    bado = new ByteArrayDataOutput(bytes);
}
/// <summary>
/// Stress test: indexes int.MaxValue documents, each with a variable-length
/// (1-3 byte) vint binary doc value (i % 65535), force-merges to one segment,
/// then reads every value back and verifies it round-trips exactly.
/// </summary>
public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulerFactories), "Values")] Func <IConcurrentMergeScheduler> newScheduler)
{
    BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));
    if (dir is MockDirectoryWrapper)
    {
        // Disable I/O throttling: this test writes billions of values.
        ((MockDirectoryWrapper)dir).Throttling = Throttling.NEVER;
    }
    var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(newScheduler())
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(OpenMode.CREATE);
    IndexWriter w = new IndexWriter(dir, config);
    Document doc = new Document();
    // One reusable doc/field/buffer: only the encoded bytes change per iteration.
    var bytes = new byte[4];
    ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
    BytesRef data = new BytesRef(bytes);
    BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
    doc.Add(dvField);
    for (int i = 0; i < int.MaxValue; i++)
    {
        encoder.Reset(bytes);
        encoder.WriteVInt32(i % 65535); // 1, 2, or 3 bytes
        data.Length = encoder.Position;
        w.AddDocument(doc);
        if (i % 100000 == 0)
        {
            Console.WriteLine("indexed: " + i);
            Console.Out.Flush();
        }
    }
    w.ForceMerge(1);
    w.Dispose();
    Console.WriteLine("verifying...");
    Console.Out.Flush();
    DirectoryReader r = DirectoryReader.Open(dir);
    int expectedValue = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    foreach (AtomicReaderContext context in r.Leaves)
    {
        AtomicReader reader = context.AtomicReader;
        BytesRef scratch = new BytesRef(bytes);
        BinaryDocValues dv = reader.GetBinaryDocValues("dv");
        for (int i = 0; i < reader.MaxDoc; i++)
        {
            dv.Get(i, scratch);
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            Assert.AreEqual(expectedValue % 65535, input.ReadVInt32());
            // The stored value must contain exactly the vint and nothing more.
            Assert.IsTrue(input.Eof);
            expectedValue++;
        }
    }
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Stress test (older API variant): indexes int.MaxValue documents, each with
/// a variable-length (1-3 byte) vint binary doc value (i % 65535), force-merges
/// to one segment, then reads every value back and verifies it round-trips.
/// </summary>
public virtual void TestVariableBinary([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")]IConcurrentMergeScheduler scheduler)
{
    BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BVariableBinary"));
    if (dir is MockDirectoryWrapper)
    {
        // Disable I/O throttling: this test writes billions of values.
        ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
    }
    var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
        .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .SetRAMBufferSizeMB(256.0)
        .SetMergeScheduler(scheduler)
        .SetMergePolicy(NewLogMergePolicy(false, 10))
        .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);
    IndexWriter w = new IndexWriter(dir, config);
    Document doc = new Document();
    // One reusable doc/field/buffer: only the encoded bytes change per iteration.
    var bytes = new byte[4];
    ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
    BytesRef data = new BytesRef(bytes);
    BinaryDocValuesField dvField = new BinaryDocValuesField("dv", data);
    doc.Add(dvField);
    for (int i = 0; i < int.MaxValue; i++)
    {
        encoder.Reset(bytes);
        encoder.WriteVInt(i % 65535); // 1, 2, or 3 bytes
        data.Length = encoder.Position;
        w.AddDocument(doc);
        if (i % 100000 == 0)
        {
            Console.WriteLine("indexed: " + i);
            Console.Out.Flush();
        }
    }
    w.ForceMerge(1);
    w.Dispose();
    Console.WriteLine("verifying...");
    Console.Out.Flush();
    DirectoryReader r = DirectoryReader.Open(dir);
    int expectedValue = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    foreach (AtomicReaderContext context in r.Leaves)
    {
        AtomicReader reader = context.AtomicReader;
        BytesRef scratch = new BytesRef(bytes);
        BinaryDocValues dv = reader.GetBinaryDocValues("dv");
        for (int i = 0; i < reader.MaxDoc; i++)
        {
            dv.Get(i, scratch);
            input.Reset((byte[])(Array)scratch.Bytes, scratch.Offset, scratch.Length);
            Assert.AreEqual(expectedValue % 65535, input.ReadVInt());
            // The stored value must contain exactly the vint and nothing more.
            Assert.IsTrue(input.Eof());
            expectedValue++;
        }
    }
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Parses a specific affix rule putting the result into the provided affix map.
/// Each rule is encoded as four shorts (flag, strip ord, pattern ord with the
/// cross-product bit, append-flags ord) appended to <c>affixData</c>.
/// </summary>
/// <param name="affixes"> Map where the result of the parsing will be put </param>
/// <param name="header"> Header line of the affix rule </param>
/// <param name="reader"> BufferedReader to read the content of the rule from </param>
/// <param name="conditionPattern"> <seealso cref="String#format(String, Object...)"/> pattern to be used to generate the condition regex
/// pattern </param>
/// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param>
/// <param name="seenStrips"> map from strip string -> ordinal, for deduplication. </param>
/// <exception cref="IOException"> Can be thrown while reading the rule </exception>
private void ParseAffix(SortedDictionary <string, IList <char?> > affixes, string header, TextReader reader, string conditionPattern, IDictionary <string, int?> seenPatterns, IDictionary <string, int?> seenStrips)
{
    BytesRef scratch = new BytesRef();
    StringBuilder sb = new StringBuilder();
    string[] args = whitespacePattern.Split(header);

    bool crossProduct = args[2].Equals("Y");

    int numLines = int.Parse(args[3], CultureInfo.InvariantCulture);
    // Each affix entry occupies 8 bytes (four shorts), hence the << 3 arithmetic.
    affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++)
    {
        Debug.Assert(affixWriter.Position == currentAffix << 3);
        string line = reader.ReadLine();
        string[] ruleArgs = whitespacePattern.Split(line);

        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.Length < 4)
        {
            throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader
        }

        char flag = flagParsingStrategy.ParseFlag(ruleArgs[1]);
        // "0" in the strip position means: strip nothing.
        string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2];
        string affixArg = ruleArgs[3];
        char[] appendFlags = null;

        // An affix argument may carry continuation flags after a '/' separator.
        int flagSep = affixArg.LastIndexOf('/');
        if (flagSep != -1)
        {
            string flagPart = affixArg.Substring(flagSep + 1);
            affixArg = affixArg.Substring(0, flagSep - 0);

            if (aliasCount > 0)
            {
                // Numeric flag part is an alias index into the AF table.
                flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
            }

            appendFlags = flagParsingStrategy.ParseFlags(flagPart);
            Array.Sort(appendFlags);
            twoStageAffix = true;
        }

        // TODO: add test and fix zero-affix handling!

        string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue
        if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
        {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.IndexOf('-') >= 0)
        {
            condition = condition.Replace("-", "\\-");
        }

        string regex;
        if (".".Equals(condition))
        {
            regex = ".*"; // Zero condition is indicated by dot
        }
        else if (condition.Equals(strip))
        {
            regex = ".*"; // TODO: optimize this better:
            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
            // but this is complicated...
        }
        else
        {
            regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition);
        }

        // deduplicate patterns
        int? patternIndex = seenPatterns.ContainsKey(regex) ? seenPatterns[regex] : null;
        if (patternIndex == null)
        {
            patternIndex = patterns.Count;
            // Pattern ordinal must fit in a short (it is written below via WriteShort).
            if (patternIndex > short.MaxValue)
            {
                throw new System.NotSupportedException("Too many patterns, please report this to [email protected]");
            }
            seenPatterns[regex] = patternIndex;
            CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton());
            patterns.Add(pattern);
        }

        // deduplicate strips
        int? stripOrd = seenStrips.ContainsKey(strip) ? seenStrips[strip] : null;
        if (stripOrd == null)
        {
            stripOrd = seenStrips.Count;
            seenStrips[strip] = stripOrd;
            if (stripOrd > char.MaxValue)
            {
                throw new System.NotSupportedException("Too many unique strips, please report this to [email protected]");
            }
        }

        if (appendFlags == null)
        {
            appendFlags = NOFLAGS;
        }

        EncodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.Add(scratch);
        if (appendFlagsOrd < 0)
        {
            // already exists in our hash
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        }
        else if (appendFlagsOrd > short.MaxValue)
        {
            // this limit is probably flexible, but its a good sanity check too
            throw new System.NotSupportedException("Too many unique append flags, please report this to [email protected]");
        }

        // Fixed-width 8-byte record: flag, strip ord, pattern ord (+ crossProduct bit), append-flags ord.
        affixWriter.WriteShort((short)flag);
        affixWriter.WriteShort((short)stripOrd);
        // encode crossProduct into patternIndex
        int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
        affixWriter.WriteShort((short)patternOrd);
        affixWriter.WriteShort((short)appendFlagsOrd);

        if (needsInputCleaning)
        {
            string cleaned = CleanInput(affixArg, sb);
            affixArg = cleaned.ToString();
        }

        IList <char?> list = affixes.ContainsKey(affixArg) ? affixes[affixArg] : null;
        if (list == null)
        {
            list = new List <char?>();
            affixes[affixArg] = list;
        }

        list.Add((char)currentAffix);
        currentAffix++;
    }
}
/// <summary>
/// Builds the suggester FST. Each surface form is analyzed into one or more
/// finite strings; every (analyzed form, weight, surface form[, payload])
/// record is written to a temp file, offline-sorted by analyzed form, then the
/// sorted records are deduplicated per analyzed form and added to an FST
/// mapping analyzed bytes -> (cost, surface[+payload]).
/// </summary>
public override void Build(IInputEnumerator enumerator)
{
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = enumerator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while (enumerator.MoveNext())
        {
            surfaceForm = enumerator.Current;
            // All analyzed paths for this surface form (graph expansion).
            ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;

                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = enumerator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteInt16((short)analyzedLength);

                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                output.WriteInt32(EncodeWeight(enumerator.Weight));

                if (hasPayloads)
                {
                    // PAYLOAD_SEP is the in-record separator between surface and payload.
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException(
                                "surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                }

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we still index the
        // highest-weight one). We clear this when we see a new analyzed form,
        // so it cannot grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet <BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            // +2 leaves room for the (dup-marker, dedup) suffix appended below.
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                // FST output = surface + PAYLOAD_SEP + payload.
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }
        // Temp files are removed regardless of outcome.
        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Builds a <see cref="SynonymMap"/> and returns it.
/// For each input term (sorted in UTF-8 order) the synonym ords are written as
/// vints, prefixed by a vint header of (count &lt;&lt; 1 | keepOrig bit), and the
/// resulting byte sequence becomes the FST output for that input.
/// </summary>
public virtual SynonymMap build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    // Tracks ords already written for the current input when dedup is enabled.
    HashSet<int?> dedupSet = dedup ? new HashSet<int?>() : null;

    // Scratch space for the vint header (a vint is at most 5 bytes).
    var spare = new byte[5];

    // Inputs must be added to the FST builder in sorted (UTF-8) order.
    CharsRef[] sortedKeys = new CharsRef[workingSet.Count];
    workingSet.Keys.CopyTo(sortedKeys, 0);
#pragma warning disable 612, 618
    System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618

    Int32sRef scratchIntsRef = new Int32sRef();
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case: header vint + one vint per ord.
        int estimatedSize = 5 + numEntries * 5;

        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        Debug.Assert(scratch.Offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.WriteVInt32(output.ords[i]);
            count++;
        }

        int pos = scratchOutput.Position;
        scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;

        // Move the count + includeOrig header to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.Length = scratchOutput.Position - scratch.Offset;
        builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }

    FST<BytesRef> fst = builder.Finish();
    return new SynonymMap(fst, words, maxHorizontalContext);
}
/// <summary>
/// Builds an <see cref="SynonymMap"/> and returns it.
/// For each input term (sorted in UTF-8 order) the synonym ords are written as
/// vints, prefixed by a vint header of (count &lt;&lt; 1 | keepOrig bit), and the
/// resulting byte sequence becomes the FST output for that input.
/// </summary>
public virtual SynonymMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    // Tracks ords already written for the current input when dedup is enabled.
    HashSet <int?> dedupSet;

    if (dedup)
    {
        dedupSet = new HashSet <int?>();
    }
    else
    {
        dedupSet = null;
    }

    // Scratch space for the vint header (a vint is at most 5 bytes).
    var spare = new byte[5];

    // Inputs must be added to the FST builder in sorted (UTF-8) order.
    ICollection <CharsRef> keys = workingSet.Keys;
    CharsRef[] sortedKeys = keys.ToArray();
#pragma warning disable 612, 618
    System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618

    Int32sRef scratchIntsRef = new Int32sRef();

    //System.out.println("fmap.build");
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        Debug.Assert(scratch.Offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.WriteVInt32(output.ords[i]);
            count++;
        }

        int pos = scratchOutput.Position;
        scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;

        // Move the count + includeOrig to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.Length = scratchOutput.Position - scratch.Offset;
        //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }

    FST <BytesRef> fst = builder.Finish();
    return (new SynonymMap(fst, words, maxHorizontalContext));
}
/// <summary>
/// Builds a <see cref="SynonymMap"/> and returns it.
/// For each input term (sorted in UTF-8 order) the synonym ords are written as
/// vints, prefixed by a vint header of (count &lt;&lt; 1 | keepOrig bit), and the
/// resulting byte sequence becomes the FST output for that input.
/// </summary>
public virtual SynonymMap build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    // Tracks ords already written for the current input when dedup is enabled.
    HashSet<int?> dedupSet = dedup ? new HashSet<int?>() : null;

    // Scratch space for the vint header (a vint is at most 5 bytes).
    var spare = new byte[5];

    // Inputs must be added to the FST builder in sorted (UTF-8) order.
    CharsRef[] sortedKeys = new CharsRef[workingSet.Count];
    workingSet.Keys.CopyTo(sortedKeys, 0);
#pragma warning disable 612, 618
    System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618

    Int32sRef scratchIntsRef = new Int32sRef();
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case: header vint + one vint per ord.
        int estimatedSize = 5 + numEntries * 5;

        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        Debug.Assert(scratch.Offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.WriteVInt32(output.ords[i]);
            count++;
        }

        int pos = scratchOutput.Position;
        scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;

        // Move the count + includeOrig header to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.Length = scratchOutput.Position - scratch.Offset;
        builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }

    FST<BytesRef> fst = builder.Finish();
    return new SynonymMap(fst, words, maxHorizontalContext);
}