/// <summary>
/// Creates a new sorted wrapper, sorting by <see cref="BytesRef"/>
/// (ascending) then cost (ascending). The entire <paramref name="source"/>
/// is consumed and sorted on disk inside this constructor.
/// </summary>
/// <param name="source">the iterator whose entries will be sorted.</param>
/// <param name="comparer">primary ordering of the encoded terms.</param>
public SortedTermFreqIteratorWrapper(ITermFreqIterator source, IComparer<BytesRef> comparer)
{
    this.source = source;
    this.comparer = comparer;

    // BUGFIX: the tie-break comparer must be assigned BEFORE Sort() is called,
    // because Sort() hands tieBreakByCostComparer to the OfflineSorter. With the
    // previous ordering (this.reader = Sort() first) the sorter was constructed
    // with a null comparer.
    this.tieBreakByCostComparer = Comparer<BytesRef>.Create((left, right) =>
    {
        SortedTermFreqIteratorWrapper outerInstance = this;
        BytesRef leftScratch = new BytesRef();
        BytesRef rightScratch = new BytesRef();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Make shallow copies in case Decode changes the BytesRef:
        leftScratch.Bytes = left.Bytes;
        leftScratch.Offset = left.Offset;
        leftScratch.Length = left.Length;
        rightScratch.Bytes = right.Bytes;
        rightScratch.Offset = right.Offset;
        rightScratch.Length = right.Length;
        // Strip the encoded cost from each entry, compare by term first,
        // then break ties by cost (ascending).
        long leftCost = outerInstance.Decode(leftScratch, input);
        long rightCost = outerInstance.Decode(rightScratch, input);
        int cmp = outerInstance.comparer.Compare(leftScratch, rightScratch);
        if (cmp != 0)
        {
            return cmp;
        }
        return leftCost.CompareTo(rightCost);
    });

    this.reader = Sort();
}
/// <summary>
/// Creates a new sorted wrapper, sorting by <see cref="BytesRef"/>
/// (ascending) then cost (ascending). Sorting happens eagerly in the
/// constructor via <c>Sort()</c>.
/// </summary>
/// <param name="source">the iterator whose entries will be sorted.</param>
/// <param name="comparer">primary ordering of the encoded terms.</param>
public SortedTermFreqIteratorWrapper(ITermFreqIterator source, IComparer<BytesRef> comparer)
{
    this.source = source;
    this.comparer = comparer;

    // BUGFIX: the tie-break comparer must be initialized BEFORE Sort() runs,
    // because Sort() passes tieBreakByCostComparer to the OfflineSorter; with
    // the previous ordering the sorter received a null comparer.
    this.tieBreakByCostComparer = new ComparerAnonymousInnerClassHelper(this);
    this.reader = Sort();
}
/// <summary>
/// Creates a new sorted wrapper around <paramref name="source"/>, sorting
/// entries by <see cref="BytesRef"/> (ascending) and then by cost (ascending).
/// The whole input is consumed and sorted on disk inside this constructor.
/// </summary>
/// <param name="source">the iterator to sort; its payload/context flags are captured.</param>
/// <param name="comparator">primary ordering of the encoded terms.</param>
public SortedInputIterator(InputIterator source, IComparer<BytesRef> comparator)
{
    this.source = source;
    this.comparator = comparator;
    this.hasPayloads = source.HasPayloads;
    this.hasContexts = source.HasContexts;
    // Eagerly drain and sort the source; entries are read back lazily later.
    this.reader = Sort();
}
/// <summary>
/// Creates a new sorted wrapper around <paramref name="source"/>, sorting
/// entries by <see cref="BytesRef"/> (ascending) and then by cost (ascending).
/// </summary>
/// <param name="source">the iterator to sort; its payload/context flags are captured.</param>
/// <param name="comparator">primary ordering of the encoded terms.</param>
public SortedInputIterator(IInputIterator source, IComparer<BytesRef> comparator)
{
    // The tie-break comparer must exist before Sort() runs, since Sort() uses it.
    this.tieBreakByCostComparator = new ComparatorAnonymousInnerClassHelper(this);
    this.source = source;
    this.comparator = comparator;
    this.hasPayloads = source.HasPayloads;
    this.hasContexts = source.HasContexts;
    // Eagerly drain and sort the source; entries are read back lazily later.
    this.reader = Sort();
}
/// <summary>
/// Creates a new sorted wrapper around <paramref name="source"/>, sorting by
/// <see cref="BytesRef"/> (ascending) and then by cost (ascending). The whole
/// input is consumed and sorted on disk inside this constructor.
/// </summary>
/// <param name="source">the iterator to sort; its payload/context flags are captured.</param>
/// <param name="comparer">primary ordering of the encoded terms.</param>
public SortedInputIterator(IInputIterator source, IComparer<BytesRef> comparer)
{
    this.tieBreakByCostComparer = Comparer<BytesRef>.Create((left, right) =>
    {
        // Work on shallow copies, because the Decode* calls below adjust
        // offset/length while stripping the encoded trailer fields.
        BytesRef leftCopy = new BytesRef
        {
            Bytes = left.Bytes,
            Offset = left.Offset,
            Length = left.Length
        };
        BytesRef rightCopy = new BytesRef
        {
            Bytes = right.Bytes,
            Offset = right.Offset,
            Length = right.Length
        };
        ByteArrayDataInput reuse = new ByteArrayDataInput();

        // Decode order matters: cost first, then payload, then contexts.
        long leftCost = Decode(leftCopy, reuse);
        long rightCost = Decode(rightCopy, reuse);
        if (HasPayloads)
        {
            DecodePayload(leftCopy, reuse);
            DecodePayload(rightCopy, reuse);
        }
        if (HasContexts)
        {
            DecodeContexts(leftCopy, reuse);
            DecodeContexts(rightCopy, reuse);
        }

        // LUCENENET NOTE: this.Comparer != this.comparer!! Be sure to use the field.
        int byTerm = this.comparer.Compare(leftCopy, rightCopy);
        return byTerm != 0 ? byTerm : leftCost.CompareTo(rightCost);
    });
    this.hasPayloads = source.HasPayloads;
    this.hasContexts = source.HasContexts;
    this.source = source;
    this.comparer = comparer;
    this.reader = Sort();
}
/// <summary>
/// Drains the wrapped source to a temp ".input" file (term + payload +
/// contexts + weight per entry), sorts it on disk into a temp ".sorted" file
/// using <c>tieBreakByCostComparer</c>, and returns a reader over the sorted
/// sequences. On failure all resources are released and this instance is disposed.
/// </summary>
private OfflineSorter.ByteSequencesReader Sort()
{
    string prefix = this.GetType().Name;
    DirectoryInfo directory = OfflineSorter.DefaultTempDir();
    tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);
    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    bool success = false;
    try
    {
        BytesRef spare;
        byte[] buffer = new byte[0];
        var output = new ByteArrayDataOutput(buffer);
        // Encode every entry from the source into the unsorted temp file.
        while ((spare = source.Next()) != null)
        {
            Encode(writer, output, buffer, spare, source.Payload, source.Contexts, source.Weight);
        }
        writer.Dispose();
        // Sort by term (ascending), breaking ties by cost (ascending).
        (new OfflineSorter(tieBreakByCostComparer)).Sort(tempInput, tempSorted);
        var reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        success = true;
        return (reader);
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(writer);
        }
        else
        {
            try
            {
                // Suppress secondary exceptions so the original failure propagates.
                IOUtils.DisposeWhileHandlingException(writer);
            }
            finally
            {
                // Also tear down this wrapper (deletes temp files) on failure.
                Dispose();
            }
        }
    }
}
/// <summary>
/// Drains the wrapped source to a temp ".input" file (term + weight per
/// entry), sorts it on disk into a temp ".sorted" file using
/// <c>tieBreakByCostComparer</c>, and returns a reader over the sorted
/// sequences. On failure all resources are released.
/// </summary>
private OfflineSorter.ByteSequencesReader Sort()
{
    string prefix = this.GetType().Name;
    DirectoryInfo directory = OfflineSorter.DefaultTempDir();
    tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);
    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    bool success = false;
    try
    {
        byte[] buffer = Arrays.Empty<byte>();
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        // Encode every entry from the source into the unsorted temp file.
        while (source.MoveNext())
        {
            Encode(writer, output, buffer, source.Current, source.Weight);
        }
        writer.Dispose();
        // Sort by term (ascending), breaking ties by cost (ascending).
        (new OfflineSorter(tieBreakByCostComparer)).Sort(tempInput, tempSorted);
        OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        success = true;
        return (reader);
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(writer);
        }
        else
        {
            try
            {
                // Suppress secondary exceptions so the original failure propagates.
                IOUtils.DisposeWhileHandlingException(writer);
            }
            finally
            {
                // NOTE(review): the sibling variant calls Dispose() here instead of
                // Close() — confirm Close() performs equivalent cleanup in this class.
                Close();
            }
        }
    }
}
/// <summary>
/// Builds the suggester's FST from the given enumerator. Each surface form is
/// analyzed into one or more finite strings (the FST inputs); every
/// (analyzed form, weight, surface form[, payload]) tuple is encoded to a temp
/// file, sorted offline, deduplicated per analyzed form, and then loaded into
/// an FST mapping analyzed bytes -> (cost, surface[+payload]).
/// </summary>
/// <param name="enumerator">source of surface forms, weights and optional
/// payloads; contexts are not supported by this suggester.</param>
/// <exception cref="ArgumentException">if contexts are supplied, if an analyzed
/// or surface form exceeds the encodable length limit, or if a surface form
/// contains the reserved PAYLOAD_SEP byte while payloads are enabled.</exception>
public override void Build(IInputEnumerator enumerator)
{
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);
    hasPayloads = enumerator.HasPayloads;
    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();
    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();
    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;
        // Phase 1: encode (analyzedLength, analyzed, weight, surface[, payload])
        // records for every analyzed path of every input.
        while (enumerator.MoveNext())
        {
            surfaceForm = enumerator.Current;
            ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);
            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);
            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);
                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;
                BytesRef payload;
                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = enumerator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }
                buffer = ArrayUtil.Grow(buffer, requiredLength);
                output.Reset(buffer);
                output.WriteInt16((short)analyzedLength);
                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);
                output.WriteInt32(EncodeWeight(enumerator.Weight));
                if (hasPayloads)
                {
                    // PAYLOAD_SEP delimits surface from payload in the FST output,
                    // so it must never occur inside the surface form itself.
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                }
                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet<BytesRef>();

        var dedup = 0;
        // Phase 2: decode each sorted record and add it to the FST builder.
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            // Surface form is a slice of the scratch record, not a copy.
            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                // New analyzed form: reset the dedup counter and the seen set.
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                // FST output is surface + PAYLOAD_SEP + payload, concatenated.
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }
        // Temp files are removed on both success and failure.
        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Wraps an <see cref="OfflineSorter.ByteSequencesReader"/> together with the
/// comparer that produced its ordering.
/// </summary>
/// <param name="reader">reader over the sorted byte sequences.</param>
/// <param name="comparer">the ordering the sequences were sorted with.</param>
public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader, IComparer<BytesRef> comparer)
{
    this.comparer = comparer;
    this.reader = reader;
}
/// <summary>
/// Builds the lookup: writes each input prefixed with its 4-byte encoded weight
/// to a temp file, sorts that file (the weight prefix sorts first, so
/// equal-weight runs are contiguous), then assigns each input to one of
/// <c>buckets</c> weight buckets and feeds it to the
/// <see cref="FSTCompletionBuilder"/>. Payloads and contexts are not supported.
/// </summary>
/// <param name="iterator">source of inputs and weights.</param>
/// <exception cref="ArgumentException">if payloads or contexts are supplied.</exception>
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = new byte[0];
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        while ((spare = iterator.Next()) != null)
        {
            if (spare.Length + 4 >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
            }
            output.Reset(buffer);
            // 4-byte weight prefix, then the raw input bytes.
            output.WriteInt32(EncodeWeight(iterator.Weight));
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

        int inputLines = info.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp1 = new BytesRef();
        BytesRef tmp2 = new BytesRef();
        while (reader.Read(tmp1))
        {
            input.Reset(tmp1.Bytes);
            int currentScore = input.ReadInt32();
            int bucket;
            // Inputs with identical scores must land in the same bucket.
            if (line > 0 && currentScore == previousScore)
            {
                bucket = previousBucket;
            }
            else
            {
                bucket = (int)(line * buckets / inputLines);
            }
            previousScore = currentScore;
            previousBucket = bucket;

            // Only append the input, discard the weight.
            tmp2.Bytes = tmp1.Bytes;
            tmp2.Offset = input.Position;
            tmp2.Length = tmp1.Length - input.Position;
            builder.Add(tmp2, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer, sorter);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        // Temp files are removed on both success and failure.
        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Builds the lookup: writes each input prefixed with its 4-byte encoded weight
/// to a temp file, sorts that file (the weight prefix sorts first, so
/// equal-weight runs are contiguous), then assigns each input to one of
/// <c>buckets</c> weight buckets and feeds it to the
/// <see cref="FSTCompletionBuilder"/>. Payloads and contexts are not supported.
/// </summary>
/// <param name="iterator">source of inputs and weights.</param>
/// <exception cref="ArgumentException">if payloads or contexts are supplied.</exception>
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = new byte[0];
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        while ((spare = iterator.Next()) != null)
        {
            if (spare.Length + 4 >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
            }
            output.Reset(buffer);
            // 4-byte weight prefix, then the raw input bytes.
            // NOTE(review): this variant uses the legacy WriteInt/ReadInt names,
            // while a sibling version uses WriteInt32/ReadInt32 — confirm which
            // API surface this file targets.
            output.WriteInt(EncodeWeight(iterator.Weight));
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

        int inputLines = info.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp1 = new BytesRef();
        BytesRef tmp2 = new BytesRef();
        while (reader.Read(tmp1))
        {
            input.Reset(tmp1.Bytes);
            int currentScore = input.ReadInt();
            int bucket;
            // Inputs with identical scores must land in the same bucket.
            if (line > 0 && currentScore == previousScore)
            {
                bucket = previousBucket;
            }
            else
            {
                bucket = (int)(line * buckets / inputLines);
            }
            previousScore = currentScore;
            previousBucket = bucket;

            // Only append the input, discard the weight.
            tmp2.Bytes = tmp1.Bytes;
            tmp2.Offset = input.Position;
            tmp2.Length = tmp1.Length - input.Position;
            builder.Add(tmp2, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer, sorter);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        // Temp files are removed on both success and failure.
        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Creates a new sorted wrapper, sorting by <see cref="BytesRef"/>
/// (ascending) then cost (ascending). The whole input is consumed and sorted
/// on disk inside this constructor, which may surface an I/O error.
/// </summary>
/// <param name="source">the iterator whose entries will be sorted.</param>
/// <param name="comparator">primary ordering of the encoded terms.</param>
public SortedTermFreqIteratorWrapper(TermFreqIterator source, IComparer<BytesRef> comparator)
{
    this.comparator = comparator;
    this.source = source;
    // Sort eagerly; entries are read back lazily through this.reader.
    this.reader = Sort();
}
/// <summary>
/// Reads the dictionary file through the provided streams, building up the words map.
/// Entries are first cleaned/normalized and written to an unsorted temp file, then
/// sorted on disk, and finally merged: consecutive identical entries have their flag
/// ordinals accumulated before being added to the FST builder.
/// </summary>
/// <param name="dictionaries"> streams to read the dictionary file through </param>
/// <param name="decoder"> <see cref="Encoding"/> used to decode the contents of the file </param>
/// <param name="words"> FST builder receiving each (word, flag-ordinals) entry </param>
/// <exception cref="IOException"> Can be thrown while reading from the file </exception>
private void ReadDictionaryFiles(IList<Stream> dictionaries, Encoding decoder, Builder<IntsRef> words)
{
    BytesRef flagsScratch = new BytesRef();
    IntsRef scratchInts = new IntsRef();
    StringBuilder sb = new StringBuilder();

    // Phase 1: normalize every entry and write it to an unsorted temp file.
    FileInfo unsorted = FileSupport.CreateTempFile("unsorted", "dat", tempDir);
    using (OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted))
    {
        foreach (Stream dictionary in dictionaries)
        {
            // NOTE(review): this StreamReader is never disposed; the underlying
            // Stream appears to be owned by the caller — confirm intended.
            var lines = new StreamReader(dictionary, decoder);
            string line = lines.ReadLine(); // first line is number of entries (approximately, sometimes)
            while ((line = lines.ReadLine()) != null)
            {
                line = UnescapeEntry(line);
                if (needsInputCleaning)
                {
                    int flagSep = line.LastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1)
                    {
                        // No flags: clean the whole line.
                        string cleansed = CleanInput(line, sb);
                        writer.Write(cleansed.ToString().GetBytes(Encoding.UTF8));
                    }
                    else
                    {
                        // Clean only the word part; re-append the flag suffix untouched.
                        string text = line.Substring(0, flagSep - 0);
                        string cleansed = CleanInput(text, sb);
                        if (cleansed != sb.ToString())
                        {
                            sb.Length = 0;
                            sb.Append(cleansed);
                        }
                        sb.Append(line.Substring(flagSep));
                        writer.Write(sb.ToString().GetBytes(Encoding.UTF8));
                    }
                }
                else
                {
                    writer.Write(line.GetBytes(Encoding.UTF8));
                }
            }
        }
    }

    // Phase 2: sort the entries so identical words become adjacent.
    FileInfo sorted = FileSupport.CreateTempFile("sorted", "dat", tempDir);
    OfflineSorter sorter = new OfflineSorter(new ComparatorAnonymousInnerClassHelper(this));
    sorter.Sort(unsorted, sorted);
    try
    {
        unsorted.Delete();
    }
    catch
    {
        // ignore — best-effort temp-file cleanup
    }

    // Phase 3: read back sorted entries, parse flags, and merge runs of the
    // same word into a single (word, ordinals) addition to the builder.
    using (OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted))
    {
        BytesRef scratchLine = new BytesRef();

        // TODO: the flags themselves can be double-chars (long) or also numeric
        // either way the trick is to encode them as char... but they must be parsed differently

        string currentEntry = null;
        IntsRef currentOrds = new IntsRef();

        string line2;
        while (reader.Read(scratchLine))
        {
            line2 = scratchLine.Utf8ToString();
            string entry;
            char[] wordForm;

            int flagSep = line2.LastIndexOf(FLAG_SEPARATOR);
            if (flagSep == -1)
            {
                wordForm = NOFLAGS;
                entry = line2;
            }
            else
            {
                // note, there can be comments (morph description) after a flag.
                // we should really look for any whitespace: currently just tab and space
                int end = line2.IndexOf('\t', flagSep);
                if (end == -1)
                {
                    end = line2.Length;
                }
                int end2 = line2.IndexOf(' ', flagSep);
                if (end2 == -1)
                {
                    end2 = line2.Length;
                }
                end = Math.Min(end, end2);

                string flagPart = line2.Substring(flagSep + 1, end - (flagSep + 1));
                if (aliasCount > 0)
                {
                    // Numeric flag aliases are resolved through the AF table.
                    flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
                }

                wordForm = flagParsingStrategy.ParseFlags(flagPart);
                Array.Sort(wordForm);
                entry = line2.Substring(0, flagSep - 0);
            }

            // LUCENENET NOTE: CompareToOrdinal is an extension method that works similarly to
            // Java's String.compareTo method.
            int cmp = currentEntry == null ? 1 : entry.CompareToOrdinal(currentEntry);
            if (cmp < 0)
            {
                throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry);
            }
            else
            {
                EncodeFlags(flagsScratch, wordForm);
                int ord = flagLookup.Add(flagsScratch);
                if (ord < 0)
                {
                    // already exists in our hash
                    ord = (-ord) - 1;
                }
                // finalize current entry, and switch "current" if necessary
                if (cmp > 0 && currentEntry != null)
                {
                    Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
                    words.Add(scratchInts, currentOrds);
                }
                // swap current
                if (cmp > 0 || currentEntry == null)
                {
                    currentEntry = entry;
                    currentOrds = new IntsRef(); // must be this way
                }
                currentOrds.Grow(currentOrds.Length + 1);
                currentOrds.Ints[currentOrds.Length++] = ord;
            }
        }

        // finalize last entry
        Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
        words.Add(scratchInts, currentOrds);
    }
    try
    {
        sorted.Delete();
    }
    catch
    {
        // ignore — best-effort temp-file cleanup
    }
}
/// <summary>
/// Wraps a sorted-sequences reader together with the comparator that defines
/// the order of its entries.
/// </summary>
/// <param name="reader">reader over the sorted byte sequences.</param>
/// <param name="comparator">the ordering the sequences were sorted with.</param>
public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader, IComparer<BytesRef> comparator)
{
    this.comparator = comparator;
    this.reader = reader;
}