/// <summary>
/// Serializes one entry into <paramref name="buffer"/> and hands the filled
/// region to <paramref name="writer"/>. The bytes are laid out in this order:
/// entry bytes, then (when contexts are enabled) each context followed by its
/// length and finally the number of contexts, then (when payloads are enabled)
/// the payload followed by its length, and last the weight.
/// </summary>
/// <param name="writer"> destination that receives the encoded bytes </param>
/// <param name="output"> scratch output that is reset onto the buffer before writing </param>
/// <param name="buffer"> working buffer; a larger one is obtained locally when the entry does not fit </param>
/// <param name="spare"> the entry bytes </param>
/// <param name="payload"> payload bytes, only read when payloads are enabled </param>
/// <param name="contexts"> context set, only read when contexts are enabled </param>
/// <param name="weight"> weight of the entry </param>
protected internal virtual void Encode(OfflineSorter.ByteSequencesWriter writer, ByteArrayDataOutput output, sbyte[] buffer, BytesRef spare, BytesRef payload, HashSet<BytesRef> contexts, long weight)
{
    // Entry bytes plus the 8 bytes reserved for the weight.
    int bytesNeeded = spare.Length + 8;

    if (hasPayloads)
    {
        // payload bytes + 2 bytes for the payload length
        bytesNeeded += 2 + payload.Length;
    }

    if (hasContexts)
    {
        foreach (BytesRef context in contexts)
        {
            // context bytes + 2 bytes for the context length
            bytesNeeded += 2 + context.Length;
        }
        // 2 bytes for the number of contexts
        bytesNeeded += 2;
    }

    if (bytesNeeded >= buffer.Length)
    {
        buffer = ArrayUtil.Grow(buffer, bytesNeeded);
    }

    output.Reset(buffer);
    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);

    if (hasContexts)
    {
        foreach (BytesRef context in contexts)
        {
            output.WriteBytes(context.Bytes, context.Offset, context.Length);
            output.WriteShort((short)context.Length);
        }
        output.WriteShort((short)contexts.Count);
    }

    if (hasPayloads)
    {
        output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
        output.WriteShort((short)payload.Length);
    }

    output.WriteLong(weight);

    // Only the portion actually written is forwarded.
    writer.Write(buffer, 0, output.Position);
}
/// <summary>
/// Builds the suggester FST from the entries supplied by <paramref name="iterator"/>.
/// <para>
/// Every analyzed path of every surface form is written to a temporary file as
/// one record: analyzed length (short), analyzed bytes, encoded weight (int),
/// then either the surface bytes alone, or - when payloads are enabled - the
/// surface length (short), the surface bytes and the payload bytes. The file is
/// sorted with <c>AnalyzingComparator</c>, read back, de-duplicated per analyzed
/// form and added to the FST builder. Both temporary files are deleted before
/// the method returns, whether it succeeds or not.
/// </para>
/// </summary>
/// <param name="iterator"> source of surface forms, weights and (optionally) payloads </param>
/// <exception cref="ArgumentException"> if the iterator has contexts, if an analyzed
/// form or (with payloads) a surface form is longer than <c>ushort.MaxValue - 2</c>,
/// or if (with payloads) a surface form contains the reserved separator byte </exception>
public override void Build(IInputIterator iterator)
{
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = iterator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = TokenStreamToAutomaton;

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while ((surfaceForm = iterator.Next()) != null)
        {
            ISet<IntsRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (IntsRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new System.ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;

                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = iterator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteShort((short)analyzedLength);

                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                output.WriteInt(EncodeWeight(iterator.Weight));

                if (hasPayloads)
                {
                    // The surface form occupies [Offset, Offset + Length) of its
                    // backing array, so the scan must start at Offset; indexing
                    // from 0 would inspect unrelated bytes for a sliced BytesRef.
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[surfaceForm.Offset + i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException(
                                "surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteShort((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength);

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparator(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveIntOutputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        IntsRef scratchInts = new IntsRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadShort();
            // +2 leaves room for the two trailing bytes appended below
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt();

            // surface is a view into scratch, not a copy
            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadShort();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToIntsRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                // Everything after the surface bytes in the record is the payload.
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                // FST output is: surface bytes, PAYLOAD_SEP, payload bytes
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Parses a specific affix rule putting the result into the provided affix map
/// </summary>
/// <param name="affixes"> Map where the result of the parsing will be put </param>
/// <param name="header"> Header line of the affix rule </param>
/// <param name="reader"> Reader to read the content of the rule from </param>
/// <param name="conditionPattern"> <c>string.Format</c> pattern to be used to generate the condition regex
///                         pattern </param>
/// <param name="seenPatterns"> map from condition to index of patterns, for deduplication. </param>
/// <param name="seenStrips"> map from strip string to its ordinal, for deduplication. </param>
/// <exception cref="IOException"> Can be thrown while reading the rule </exception>
/// <exception cref="Exception"> If the input ends before all rule lines announced by the header
///                         were read, or a rule line has fewer than four elements </exception>
/// <exception cref="NotSupportedException"> If there are too many unique patterns, strips or
///                         append-flag sets to be encoded </exception>
private void ParseAffix(SortedDictionary<string, IList<char?>> affixes, string header, TextReader reader, string conditionPattern, IDictionary<string, int?> seenPatterns, IDictionary<string, int?> seenStrips)
{
    BytesRef scratch = new BytesRef();
    StringBuilder sb = new StringBuilder();
    string[] args = whitespacePattern.Split(header);

    bool crossProduct = args[2].Equals("Y");

    int numLines = int.Parse(args[3], CultureInfo.InvariantCulture);
    // Each affix occupies 8 bytes (four shorts) in affixData, hence the << 3.
    affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++)
    {
        Debug.Assert(affixWriter.Position == currentAffix << 3);
        string line = reader.ReadLine();
        if (line == null)
        {
            // ReadLine returns null at end of input; report the truncated file
            // explicitly (same exception type as the malformed-rule error below)
            // instead of failing later on a null argument.
            throw new Exception("The affix file ended unexpectedly: expected " + numLines + " rules but found only " + i + " for header: " + header);
        }
        string[] ruleArgs = whitespacePattern.Split(line);

        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.Length < 4)
        {
            throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader
        }

        char flag = flagParsingStrategy.ParseFlag(ruleArgs[1]);
        // "0" denotes an empty strip
        string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2];
        string affixArg = ruleArgs[3];
        char[] appendFlags = null;

        int flagSep = affixArg.LastIndexOf('/');
        if (flagSep != -1)
        {
            // Text after the last '/' is the append-flag part (or an alias index).
            string flagPart = affixArg.Substring(flagSep + 1);
            affixArg = affixArg.Substring(0, flagSep);

            if (aliasCount > 0)
            {
                flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
            }

            appendFlags = flagParsingStrategy.ParseFlags(flagPart);
            Array.Sort(appendFlags);
            twoStageAffix = true;
        }

        // TODO: add test and fix zero-affix handling!

        string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue
        if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
        {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.IndexOf('-') >= 0)
        {
            condition = condition.Replace("-", "\\-");
        }

        string regex;
        if (".".Equals(condition))
        {
            regex = ".*"; // Zero condition is indicated by dot
        }
        else if (condition.Equals(strip))
        {
            regex = ".*"; // TODO: optimize this better:
                          // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                          // but this is complicated...
        }
        else
        {
            regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition);
        }

        // deduplicate patterns
        int? patternIndex;
        if (!seenPatterns.TryGetValue(regex, out patternIndex) || patternIndex == null)
        {
            patternIndex = patterns.Count;
            if (patternIndex > short.MaxValue)
            {
                throw new System.NotSupportedException("Too many patterns, please report this to [email protected]");
            }
            seenPatterns[regex] = patternIndex;
            CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton());
            patterns.Add(pattern);
        }

        // deduplicate strips
        int? stripOrd;
        if (!seenStrips.TryGetValue(strip, out stripOrd) || stripOrd == null)
        {
            stripOrd = seenStrips.Count;
            seenStrips[strip] = stripOrd;
            if (stripOrd > char.MaxValue)
            {
                throw new System.NotSupportedException("Too many unique strips, please report this to [email protected]");
            }
        }

        if (appendFlags == null)
        {
            appendFlags = NOFLAGS;
        }

        EncodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.Add(scratch);
        if (appendFlagsOrd < 0)
        {
            // already exists in our hash
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        }
        else if (appendFlagsOrd > short.MaxValue)
        {
            // this limit is probably flexible, but its a good sanity check too
            throw new System.NotSupportedException("Too many unique append flags, please report this to [email protected]");
        }

        // One affix record: flag, strip ordinal, pattern ordinal, append-flags ordinal.
        affixWriter.WriteShort((short)flag);
        affixWriter.WriteShort((short)stripOrd);
        // encode crossProduct into patternIndex
        int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
        affixWriter.WriteShort((short)patternOrd);
        affixWriter.WriteShort((short)appendFlagsOrd);

        if (needsInputCleaning)
        {
            affixArg = CleanInput(affixArg, sb);
        }

        IList<char?> list;
        if (!affixes.TryGetValue(affixArg, out list) || list == null)
        {
            list = new List<char?>();
            affixes[affixArg] = list;
        }

        list.Add((char)currentAffix);
        currentAffix++;
    }
}