/// <summary>
/// Creates a uniform distribution over capitalized strings: a single upper case letter followed by
/// lower case letters (or letters of either case when <paramref name="allowUpperAfterFirst"/> is set),
/// with the total length within the given bounds.
/// If <paramref name="maxLength"/> is <see langword="null"/>, the length has no upper bound
/// and the resulting distribution is therefore improper.
/// </summary>
/// <param name="minLength">
/// The minimum possible string length, counting the leading upper case letter.
/// Must be 1 or more. Defaults to 2.
/// </param>
/// <param name="maxLength">
/// The maximum possible string length, or <see langword="null"/> for no upper bound on length.
/// Must not be less than <paramref name="minLength"/>. Defaults to <see langword="null"/>.
/// </param>
/// <param name="allowUpperAfterFirst">
/// Whether upper case letters may follow the initial upper case letter.
/// If <see langword="false"/>, only lower case letters are allowed after it.
/// </param>
/// <returns>The created distribution.</returns>
public static StringDistribution Capitalized(int minLength = 2, int? maxLength = null, bool allowUpperAfterFirst = false)
{
    Argument.CheckIfInRange(minLength >= 1, "minLength", "The minimum length of a capitalized string should be 1 or more.");
    Argument.CheckIfValid(!maxLength.HasValue || maxLength.Value >= minLength, "The maximum length cannot be less than the minimum length.");

    // The distribution starts out as the single leading upper case character.
    var capitalized = StringDistribution.Char(ImmutableDiscreteChar.Upper());

    // The leading character takes one position, so the suffix bounds are one less than the requested ones.
    int minSuffixLength = minLength - 1;

    if (!maxLength.HasValue)
    {
        // The suffix has no upper length bound, i.e. it is improper. Its weight function is rescaled
        // by the normalizer of the leading character so that the concatenation is 1 on its support.
        double logNormalizer = capitalized.GetLogAverageOf(capitalized);
        var unboundedSuffix = allowUpperAfterFirst
            ? StringDistribution.Letters(minLength: minSuffixLength)
            : StringDistribution.Lower(minLength: minSuffixLength);
        var suffixFunc = unboundedSuffix.ToNormalizedAutomaton();
        var scaledSuffixFunc = suffixFunc.ScaleLog(-logNormalizer);
        capitalized.AppendInPlace(StringDistribution.FromWeightFunction(scaledSuffixFunc));
        return capitalized;
    }

    int maxSuffixLength = maxLength.Value - 1;
    var boundedSuffix = allowUpperAfterFirst
        ? StringDistribution.Letters(minLength: minSuffixLength, maxLength: maxSuffixLength)
        : StringDistribution.Lower(minLength: minSuffixLength, maxLength: maxSuffixLength);
    capitalized.AppendInPlace(boundedSuffix);
    return capitalized;
}
/// <summary>
/// Creates a distribution whose weight function accepts the empty string as well as any string
/// made of one character allowed by <paramref name="startsWith"/> followed by zero or more
/// characters allowed by <paramref name="charsInMainString"/>.
/// </summary>
/// <param name="charsInMainString">The distribution representing characters allowed after the first one.</param>
/// <param name="startsWith">The distribution representing characters allowed in the first position.</param>
/// <returns>The created distribution.</returns>
public static StringDistribution EmptyOrStartsWith(ImmutableDiscreteChar charsInMainString, ImmutableDiscreteChar startsWith)
{
    // TODO: fix equality and then use factory methods to create this

    // Each transition weight is the inverse (in the log domain) of the average of the character
    // distribution with itself; presumably this makes every allowed character contribute
    // a weight of 1 -- confirm against ImmutableDiscreteChar.GetLogAverageOf.
    var firstCharWeight = Weight.FromLogValue(-startsWith.GetLogAverageOf(startsWith));
    var mainCharWeight = Weight.FromLogValue(-charsInMainString.GetLogAverageOf(charsInMainString));

    var builder = new StringAutomaton.Builder();

    // Ending in the start state means the empty string is accepted.
    builder.Start.SetEndWeight(Weight.One);

    // After the first character, any number of main-string characters may follow.
    var bodyState = builder.Start.AddTransition(startsWith, firstCharWeight);
    bodyState.AddSelfTransition(charsInMainString, mainCharWeight);
    bodyState.SetEndWeight(Weight.One);

    return StringDistribution.FromWeightFunction(builder.GetAutomaton());
}
/// <summary>
/// Creates a uniform distribution over any string starting and ending with a non-word character.
/// Characters between the first and the last one are restricted to be non-zero probability characters
/// from a given distribution. A string consisting of a single non-word character is accepted as well.
/// </summary>
/// <param name="allowedChars">The distribution representing allowed characters.</param>
/// <param name="nonWordCharacter">
/// The distribution representing word separating characters.
/// If <see langword="null"/>, <c>NonWordCharacter</c> is used.
/// </param>
/// <returns>The created distribution.</returns>
public static StringDistribution WordMiddle(ImmutableDiscreteChar allowedChars, ImmutableDiscreteChar? nonWordCharacter = null)
{
    // TODO: fix equality and then use factory methods to create this
    if (!nonWordCharacter.HasValue)
    {
        nonWordCharacter = NonWordCharacter;
    }

    // The same separator label and weight are used for both the leading and the trailing transition.
    var separator = Option.FromNullable(nonWordCharacter);
    var separatorWeight = Weight.FromLogValue(-nonWordCharacter.Value.GetLogAverageOf(nonWordCharacter.Value));
    var allowedWeight = Weight.FromLogValue(-allowedChars.GetLogAverageOf(allowedChars));

    var builder = new StringAutomaton.Builder();

    // Leading separator; ending here accepts a string made of a single non-word character.
    var afterLeadingSeparator = builder.Start.AddTransition(separator, separatorWeight);
    afterLeadingSeparator.SetEndWeight(Weight.One);

    // Middle part: any number of allowed characters, reached through an epsilon transition,
    // followed by the trailing separator.
    var middleState = afterLeadingSeparator.AddEpsilonTransition(Weight.One);
    var afterTrailingSeparator = middleState
        .AddSelfTransition(allowedChars, allowedWeight)
        .AddTransition(separator, separatorWeight);
    afterTrailingSeparator.SetEndWeight(Weight.One);

    return StringDistribution.FromWeightFunction(builder.GetAutomaton());
}