예제 #1
0
        /// <summary>
        /// Computes a distribution over the substrings of <paramref name="str"/> that begin at
        /// <paramref name="start"/> and whose length lies between <paramref name="minLength"/>
        /// and <paramref name="maxLength"/> (inclusive).
        /// </summary>
        /// <param name="str">The distribution over the source string.</param>
        /// <param name="start">The zero-based index at which the substring starts. Must be non-negative.</param>
        /// <param name="minLength">The minimum substring length. Must be non-negative.</param>
        /// <param name="maxLength">The maximum substring length. Must be non-negative.</param>
        /// <returns>The distribution over the substring.</returns>
        /// <remarks>
        /// When <paramref name="str"/> is a point mass, every requested length is clamped to the number of
        /// characters available after <paramref name="start"/>, and the distinct resulting substrings are
        /// combined with <c>StringDistribution.OneOf</c>.
        /// NOTE(review): if <paramref name="start"/> lies beyond the end of the point, <c>Substring</c> still
        /// throws <c>ArgumentOutOfRangeException</c>, as it did before; confirm whether a zero distribution
        /// would be preferable there.
        /// </remarks>
        public static StringDistribution SubAverageConditional(StringDistribution str, int start, int minLength, int maxLength)
        {
            Argument.CheckIfNotNull(str, "str");
            Argument.CheckIfInRange(start >= 0, "start", "Start index must be non-negative.");
            Argument.CheckIfInRange(minLength >= 0, "minLength", "Min length must be non-negative.");
            Argument.CheckIfInRange(maxLength >= 0, "maxLength", "Max length must be non-negative.");

            if (str.IsPointMass)
            {
                var strPoint = str.Point;

                // Only the characters after 'start' can be taken. Clamping against the full string length
                // (as was done previously) made Substring throw whenever start > 0 and the requested
                // length ran past the end of the string.
                int availableLength = strPoint.Length - start;
                var alts            = new HashSet <string>();
                for (int length = minLength; length <= maxLength; length++)
                {
                    alts.Add(strPoint.Substring(start, Math.Min(length, availableLength)));
                }
                return(StringDistribution.OneOf(alts));
            }

            // General case: skip exactly 'start' characters, copy between minLength and maxLength
            // characters, then skip whatever remains.
            var anyChar    = StringAutomaton.ConstantOnElement(1.0, ImmutableDiscreteChar.Any());
            var transducer = StringTransducer.Consume(StringAutomaton.Repeat(anyChar, minTimes: start, maxTimes: start));

            transducer.AppendInPlace(StringTransducer.Copy(StringAutomaton.Repeat(anyChar, minTimes: minLength, maxTimes: maxLength)));
            transducer.AppendInPlace(StringTransducer.Consume(StringAutomaton.Constant(1.0)));

            return(StringDistribution.FromWeightFunction(transducer.ProjectSource(str.ToAutomaton())));
        }
예제 #2
0
        /// <summary>
        /// Samples from a single-state automaton with a self-loop and checks that the empirical mean and
        /// variance of the sample lengths match those of the corresponding geometric distribution.
        /// </summary>
        public void SampleGeometric()
        {
            const double StopProbability = 0.7;
            const int    DrawCount       = 30000;
            const double Tolerance       = 1e-2;

            // Closed-form moments of the number of 'a' characters emitted before stopping.
            const double MeanOfLength     = (1.0 - StopProbability) / StopProbability;
            const double VarianceOfLength = (1.0 - StopProbability) / (StopProbability * StopProbability);

            Rand.Restart(96);

            // One state that either ends (with the stopping probability) or emits 'a' and loops back.
            StringAutomaton loop = StringAutomaton.Zero();
            loop.Start = loop.AddState();
            loop.Start.SetEndWeight(Weight.FromValue(StopProbability));
            loop.Start.AddTransition('a', Weight.FromValue(1 - StopProbability), loop.Start);

            StringDistribution lengthSource = StringDistribution.FromWeightFunction(loop);
            var lengthStats = new MeanVarianceAccumulator();

            int drawn = 0;
            while (drawn < DrawCount)
            {
                lengthStats.Add(lengthSource.Sample().Length);
                ++drawn;
            }

            Assert.Equal(MeanOfLength, lengthStats.Mean, Tolerance);
            Assert.Equal(VarianceOfLength, lengthStats.Variance, Tolerance);
        }
예제 #3
0
        /// <summary>
        /// Checks that distributions built in several different ways, each having a single supported string,
        /// report <c>IsPointMass</c> and expose the expected <c>Point</c>.
        /// </summary>
        public void PointMassDetection()
        {
            // Product of two mixtures that share exactly one string.
            StringDistribution leftMixture  = StringDistribution.OneOf("hello", "world", "people");
            StringDistribution rightMixture = StringDistribution.OneOf("greetings", "people", "animals");
            StringDistribution viaProduct   = leftMixture.Product(rightMixture);
            Assert.True(viaProduct.IsPointMass);
            Assert.Equal("people", viaProduct.Point);

            // Weighted mixture in which one of the alternatives has zero weight.
            var weightedStrings = new Dictionary <string, double>();
            weightedStrings.Add("a", 3.0);
            weightedStrings.Add("b", 0.0);
            StringDistribution viaZeroWeight = StringDistribution.OneOf(weightedStrings);
            Assert.True(viaZeroWeight.IsPointMass);
            Assert.Equal("a", viaZeroWeight.Point);

            // Case-invariant distribution over a string of digits.
            StringDistribution viaCaseInvariant = StringDistribution.CaseInvariant("123");
            Assert.True(viaCaseInvariant.IsPointMass);
            Assert.Equal("123", viaCaseInvariant.Point);

            // Single fixed character.
            StringDistribution viaChar = StringDistribution.Char('Z');
            Assert.True(viaChar.IsPointMass);
            Assert.Equal("Z", viaChar.Point);

            // Mixture of distributions where the second component has zero weight.
            StringDistribution viaDistMixture = StringDistribution.OneOf(1.0, StringDistribution.String("!"), 0.0, StringDistribution.Any());
            Assert.True(viaDistMixture.IsPointMass);
            Assert.Equal("!", viaDistMixture.Point);

            // Repetition with identical lower and upper bounds.
            StringDistribution viaRepeat = StringDistribution.Repeat('@', minTimes: 3, maxTimes: 3);
            Assert.True(viaRepeat.IsPointMass);
            Assert.Equal("@@@", viaRepeat.Point);

            // Concatenation of two point masses.
            StringDistribution viaAppend = StringDistribution.String("hello").Append(StringDistribution.String(" world"));
            Assert.True(viaAppend.IsPointMass);
            Assert.Equal("hello world", viaAppend.Point);

            // Automaton assembled by repeatedly appending a sum of two identical single-string automata.
            const string Element      = "a";
            const int    ElementCount = 22;

            string          expectedPoint = string.Empty;
            StringAutomaton chain         = StringAutomaton.Empty();
            int             appended      = 0;

            while (appended < ElementCount)
            {
                chain.AppendInPlace(StringAutomaton.ConstantOn(1.0, Element, Element));
                expectedPoint = expectedPoint + Element;
                ++appended;
            }

            StringDistribution viaChain = StringDistribution.FromWeightFunction(chain);
            Assert.True(viaChain.IsPointMass);
            Assert.Equal(expectedPoint, viaChain.Point);
        }
예제 #4
0
        /// <include file='FactorDocs.xml' path='factor_docs/message_op_class[@name="StringConcatOp"]/message_doc[@name="Str2AverageConditional(StringDistribution, StringDistribution)"]/*'/>
        public static StringDistribution Str2AverageConditional(StringDistribution concat, StringDistribution str1)
        {
            Argument.CheckIfNotNull(concat, "concat");
            Argument.CheckIfNotNull(str1, "str1");

            // Build a transducer that consumes whatever 'str1' accepts and then copies the remaining input.
            StringTransducer stripPrefix = StringTransducer.Consume(str1.ToAutomaton());
            stripPrefix.AppendInPlace(StringTransducer.Copy());

            // Apply the transducer to the concatenation to obtain the weight function of the result.
            var suffixWeights = stripPrefix.ProjectSource(concat.ToAutomaton());
            return StringDistribution.FromWeightFunction(suffixWeights);
        }
예제 #5
0
        /// <include file='FactorDocs.xml' path='factor_docs/message_op_class[@name="StringOfLengthOp"]/message_doc[@name="StrAverageConditional(DiscreteChar, Discrete)"]/*'/>
        public static StringDistribution StrAverageConditional(DiscreteChar allowedChars, Discrete length)
        {
            Argument.CheckIfNotNull(length, "length");
            Argument.CheckIfValid(allowedChars.IsPartialUniform(), "allowedChars", "The set of allowed characters must be passed as a partial uniform distribution.");

            // Automaton over a single allowed character, weighted (in the log domain) by the
            // log-average of 'allowedChars' with itself.
            var singleChar = StringAutomaton.ConstantOnElementLog(
                allowedChars.GetLogAverageOf(allowedChars),
                allowedChars.WrappedDistribution);

            // Repeat that automaton according to the workspace of the length distribution.
            var repeated = StringAutomaton.Repeat(singleChar, length.GetWorkspace());
            return StringDistribution.FromWeightFunction(repeated);
        }
예제 #6
0
        /// <include file='FactorDocs.xml' path='factor_docs/message_op_class[@name="SubstringOp"]/message_doc[@name="StrAverageConditional(StringDistribution, int, int)"]/*'/>
        public static StringDistribution StrAverageConditional(StringDistribution sub, int start, int length)
        {
            Argument.CheckIfNotNull(sub, "sub");
            Argument.CheckIfInRange(start >= 0, "start", "Start index must be non-negative.");
            Argument.CheckIfInRange(length >= 0, "length", "Length must be non-negative.");

            var wildcard = StringAutomaton.ConstantOnElement(1.0, ImmutableDiscreteChar.Any());

            // Produce exactly 'start' arbitrary characters in front of the substring...
            var leadingChars = StringAutomaton.Repeat(wildcard, minTimes: start, maxTimes: start);
            var embedder     = StringTransducer.Produce(leadingChars);

            // ...copy exactly 'length' characters of the substring itself...
            var copiedChars = StringAutomaton.Repeat(wildcard, minTimes: length, maxTimes: length);
            embedder.AppendInPlace(StringTransducer.Copy(copiedChars));

            // ...and produce an arbitrary tail after it.
            var trailingChars = StringAutomaton.Constant(1.0);
            embedder.AppendInPlace(StringTransducer.Produce(trailingChars));

            var projected = embedder.ProjectSource(sub.ToAutomaton());
            return StringDistribution.FromWeightFunction(projected);
        }
예제 #7
0
        /// <summary>
        /// Checks that a group attached to a transition of the left operand is still present in the
        /// result of <c>SetToProduct</c> when the right operand uses no groups.
        /// </summary>
        public void ProductWithGroups()
        {
            // Take the automaton of a point mass and tag the first transition of its start state with group 1.
            StringDistribution ungrouped = StringDistribution.String("ab");
            var automaton = ungrouped.GetWorkspaceOrPoint();
            var firstTransition = automaton.Start.GetTransitions()[0];
            firstTransition.Group = 1;
            automaton.Start.SetTransition(0, firstTransition);

            StringDistribution grouped = StringDistribution.FromWeightFunction(automaton);
            StringDistribution plain   = StringDistribution.OneOf("ab", "ac");

            // Preconditions: only the left operand carries the group.
            Assert.True(grouped.GetWorkspaceOrPoint().HasGroup(1));
            Assert.False(plain.GetWorkspaceOrPoint().UsesGroups());

            var product = StringDistribution.Zero();
            product.SetToProduct(grouped, plain);

            Assert.True(product.GetWorkspaceOrPoint().HasGroup(1));
        }
예제 #8
0
        /// <summary>
        /// Multiplies two distributions built from unnormalized weights over "a", "b" and "c" and checks
        /// the probabilities of the product.
        /// </summary>
        public void Product3()
        {
            // Left operand: weights 1, 2 and 4.
            StringAutomaton leftA       = StringAutomaton.ConstantOn(1.0, "a");
            StringAutomaton leftB       = StringAutomaton.ConstantOn(2.0, "b");
            StringAutomaton leftC       = StringAutomaton.ConstantOn(4.0, "c");
            StringAutomaton leftWeights = StringAutomaton.Sum(leftA, leftB, leftC);

            // Right operand: weights 2, 5 and 7.
            StringAutomaton rightA       = StringAutomaton.ConstantOn(2.0, "a");
            StringAutomaton rightB       = StringAutomaton.ConstantOn(5.0, "b");
            StringAutomaton rightC       = StringAutomaton.ConstantOn(7.0, "c");
            StringAutomaton rightWeights = StringAutomaton.Sum(rightA, rightB, rightC);

            StringDistribution left     = StringDistribution.FromWeightFunction(leftWeights);
            StringDistribution right    = StringDistribution.FromWeightFunction(rightWeights);
            StringDistribution combined = left.Product(right);

            // Pointwise products are 2, 10 and 28, which sum to 40.
            StringInferenceTestUtilities.TestProbability(combined, 2.0 / 40.0, "a");
            StringInferenceTestUtilities.TestProbability(combined, 10.0 / 40.0, "b");
            StringInferenceTestUtilities.TestProbability(combined, 28.0 / 40.0, "c");
        }
예제 #9
0
        /// <summary>
        /// Computes the distribution over the format string given the distribution over the formatted
        /// string and the distributions over the arguments.
        /// </summary>
        /// <param name="str">The distribution over the formatted string.</param>
        /// <param name="args">The distributions over the arguments.</param>
        /// <param name="argNames">The names of the arguments.</param>
        /// <returns>The distribution over the format string.</returns>
        public static StringDistribution FormatAverageConditional(StringDistribution str, IReadOnlyList <StringDistribution> args, IReadOnlyList <string> argNames)
        {
            Argument.CheckIfNotNull(str, "str");
            ValidateArguments(args, argNames);

            var argAutomata = args.Select(arg => arg.ToAutomaton()).ToList();

            // A specialised implementation may be able to handle these inputs directly.
            StringDistribution optimized;
            if (TryOptimizedFormatAverageConditionalImpl(str, argAutomata, argNames, out optimized))
            {
                return optimized;
            }

            // General case: run the placeholder-replacing transducer over 'str'.
            var replacer = GetPlaceholderReplacingTransducer(argAutomata, argNames, true, false);

            StringAutomaton rawFormat;
            if (str.IsPointMass)
            {
                rawFormat = replacer.ProjectSource(str.Point);
            }
            else
            {
                rawFormat = replacer.ProjectSource(str.ToAutomaton());
            }

            // Pass the projection through the format-string validation before wrapping it up.
            StringAutomaton checkedFormat = GetValidatedFormatString(rawFormat, argNames);
            return StringDistribution.FromWeightFunction(checkedFormat);
        }
예제 #10
0
        /// <summary>
        /// Checks that a group attached to a transition of the left operand is still present in the
        /// result of <c>SetToProduct</c> when the right operand uses no groups.
        /// </summary>
        public void ProductWithGroups()
        {
            StringDistribution ungrouped = StringDistribution.String("ab");

            // Copy the automaton into a builder and tag the transition under the start state's iterator with group 1.
            var builder  = StringAutomaton.Builder.FromAutomaton(ungrouped.GetWorkspaceOrPoint());
            var iterator = builder.Start.TransitionIterator;
            var tagged   = iterator.Value;
            tagged.Group   = 1;
            iterator.Value = tagged;

            StringDistribution grouped = StringDistribution.FromWeightFunction(builder.GetAutomaton());
            StringDistribution plain   = StringDistribution.OneOf("ab", "ac");

            // Preconditions: only the left operand carries the group.
            Assert.True(grouped.GetWorkspaceOrPoint().HasGroup(1));
            Assert.False(plain.GetWorkspaceOrPoint().UsesGroups);

            var product = StringDistribution.Zero();
            product.SetToProduct(grouped, plain);

            Assert.True(product.GetWorkspaceOrPoint().HasGroup(1));
        }
예제 #11
0
        /// <summary>
        /// The implementation of <see cref="StrAverageConditional(StringDistribution, IReadOnlyList{StringDistribution}, IReadOnlyList{string})"/>.
        /// </summary>
        /// <param name="format">The message from <c>format</c>.</param>
        /// <param name="allowedArgs">The message from <c>args</c>, truncated to allowed values and converted to automata.</param>
        /// <param name="argNames">The names of the arguments.</param>
        /// <param name="withGroups">Whether the result should mark different arguments with groups.</param>
        /// <param name="noValidation">Whether incorrect format string values should not be pruned.</param>
        /// <returns>The message to <c>str</c>.</returns>
        /// <remarks>
        /// The specialised implementation is tried first; the general transducer-based computation is used
        /// only when it returns <see langword="null"/>.
        /// </remarks>
        private static StringDistribution StrAverageConditionalImpl(
            StringDistribution format, IReadOnlyList <StringAutomaton> allowedArgs, IReadOnlyList <string> argNames, bool withGroups, bool noValidation)
        {
            StringDistribution resultDist = TryOptimizedStrAverageConditionalImpl(format, allowedArgs, argNames, withGroups);

            if (resultDist != null)
            {
                return(resultDist);
            }

            // Convert the format to an automaton once and reuse it, rather than converting it a second
            // time for validation.
            StringAutomaton validatedFormat = format.ToAutomaton();

            if (!noValidation)
            {
                // Check braces for correctness.
                validatedFormat = GetValidatedFormatString(validatedFormat, argNames);
            }

            // Now replace placeholders with arguments
            var             placeholderReplacer = GetPlaceholderReplacingTransducer(allowedArgs, argNames, false, withGroups);
            StringAutomaton str = placeholderReplacer.ProjectSource(validatedFormat);

            return(StringDistribution.FromWeightFunction(str));
        }
예제 #12
0
        /// <summary>
        /// A fast path for <see cref="StrAverageConditional(StringDistribution, IReadOnlyList{StringDistribution}, IReadOnlyList{string})"/>
        /// that handles selected inputs without building a transducer.
        /// </summary>
        /// <param name="format">The message from <c>format</c>.</param>
        /// <param name="allowedArgs">The message from <c>args</c>, truncated to allowed values and converted to automata.</param>
        /// <param name="argNames">The names of the arguments.</param>
        /// <param name="withGroups">Whether the result should mark different arguments with groups.</param>
        /// <returns>
        /// The resulting distribution when the fast path applies to the provided parameters;
        /// <see langword="null"/> when the caller has to use the general implementation.
        /// </returns>
        /// <remarks>
        /// Only a point mass <paramref name="format"/> is supported. The format text is scanned once from left
        /// to right: literal text is appended as is, and each <c>{name}</c> placeholder is replaced by the
        /// automaton of the matching argument. A zero distribution is returned for malformed braces, unknown
        /// or repeated argument names, or (when <c>RequirePlaceholderForEveryArgument</c> is set) missing arguments.
        /// NOTE(review): a '}' located before an opening brace appears to be copied as literal text, since only
        /// the text after the last placeholder is checked for stray closing braces; confirm this is intended.
        /// </remarks>
        private static StringDistribution TryOptimizedStrAverageConditionalImpl(
            StringDistribution format, IReadOnlyList <StringAutomaton> allowedArgs, IReadOnlyList <string> argNames, bool withGroups)
        {
            if (!format.IsPointMass)
            {
                // Not supported here; the caller falls back to the general case.
                return null;
            }

            string formatText      = format.Point;
            var    output          = StringAutomaton.Builder.ConstantOn(Weight.One, string.Empty);
            var    placeholderUsed = new bool[allowedArgs.Count];

            int lastClose = -1;
            int open      = formatText.IndexOf("{", StringComparison.Ordinal);

            // Validate the braces and substitute the arguments in a single pass.
            while (open != -1)
            {
                // Literal text between the previous placeholder (or the beginning) and this one.
                int literalStart = lastClose + 1;
                output.Append(StringAutomaton.ConstantOn(1.0, formatText.Substring(literalStart, open - literalStart)));

                // Locate the closing brace of this placeholder and the opening brace of the next one.
                lastClose = formatText.IndexOf("}", open + 1, StringComparison.Ordinal);
                int followingOpen = formatText.IndexOf("{", open + 1, StringComparison.Ordinal);

                // The placeholder must be closed before another one is opened.
                bool unterminated = lastClose == -1;
                bool nestedOpen   = followingOpen != -1 && followingOpen < lastClose;
                if (unterminated || nestedOpen)
                {
                    return StringDistribution.Zero();
                }

                string placeholderName = formatText.Substring(open + 1, lastClose - open - 1);
                int    argIndex        = argNames.IndexOf(placeholderName);

                // Reject names that are unknown or that have already been used.
                if (argIndex == -1 || placeholderUsed[argIndex])
                {
                    return StringDistribution.Zero();
                }

                // Substitute the argument, using group (index + 1) when groups are requested and 0 otherwise.
                int group = withGroups ? argIndex + 1 : 0;
                output.Append(allowedArgs[argIndex], group);
                placeholderUsed[argIndex] = true;

                open = followingOpen;
            }

            // No closing brace may follow the last placeholder.
            if (formatText.IndexOf('}', lastClose + 1) != -1)
            {
                return StringDistribution.Zero();
            }

            // Every argument must have been referenced when placeholders are mandatory.
            if (RequirePlaceholderForEveryArgument && !placeholderUsed.All(used => used))
            {
                return StringDistribution.Zero();
            }

            // Literal text after the last placeholder.
            output.Append(StringAutomaton.ConstantOn(1.0, formatText.Substring(lastClose + 1)));

            return StringDistribution.FromWeightFunction(output.GetAutomaton());
        }
예제 #13
0
        /// <summary>
        /// An implementation of <see cref="FormatAverageConditional(StringDistribution, IReadOnlyList{StringDistribution}, IReadOnlyList{string})"/>
        /// specialized for some cases for performance reasons.
        /// </summary>
        /// <param name="str">The message from <c>str</c>.</param>
        /// <param name="allowedArgs">The message from <c>args</c>, truncated to allowed values and converted to automata.</param>
        /// <param name="argNames">The names of the arguments.</param>
        /// <param name="resultDist">The computed result.</param>
        /// <returns>
        /// <see langword="true"/> if there is an optimized implementation available for the provided parameters,
        /// and <paramref name="resultDist"/> has been computed using it.
        /// <see langword="false"/> otherwise.
        /// </returns>
        /// <remarks>
        /// Supports the case of point mass <paramref name="str"/> and <paramref name="allowedArgs"/>,
        /// where each of the arguments is present in <paramref name="str"/> at most once and the occurrences
        /// are non-overlapping.
        /// </remarks>
        private static bool TryOptimizedFormatAverageConditionalImpl(
            StringDistribution str, IReadOnlyList <StringAutomaton> allowedArgs, IReadOnlyList <string> argNames, out StringDistribution resultDist)
        {
            resultDist = null;

            // Check the cheap condition first so that argument points are not computed
            // when 'str' is not a point mass anyway.
            if (!str.IsPointMass)
            {
                // Fall back to the general case
                return(false);
            }

            string   strPoint         = str.Point;
            string[] allowedArgPoints = Util.ArrayInit(allowedArgs.Count, i => allowedArgs[i].TryComputePoint());
            if (!allowedArgPoints.All(argPoint => argPoint != null && SubstringOccurrencesCount(strPoint, argPoint) <= 1))
            {
                // Fall back to the general case
                return(false);
            }

            // Obtain arguments present in 'str' (ordered by position)
            var argPositions =
                allowedArgPoints.Select((arg, argIndex) => Tuple.Create(argIndex, strPoint.IndexOf(arg, StringComparison.Ordinal)))
                .Where(t => t.Item2 != -1)
                .OrderBy(t => t.Item2)
                .ToList();

            if (RequirePlaceholderForEveryArgument && argPositions.Count != allowedArgs.Count)
            {
                // Some argument is not in 'str'
                resultDist = StringDistribution.Zero();
                return(true);
            }

            // The initial position -1 and length 1 make "position + length" equal to 0,
            // so the first literal segment starts at the beginning of 'str'.
            StringAutomaton result            = StringAutomaton.ConstantOn(1.0, string.Empty);
            int             curArgumentIndex  = -1;
            int             curArgumentPos    = -1;
            int             curArgumentLength = 1;

            for (int i = 0; i < argPositions.Count; ++i)
            {
                int prevArgumentIndex  = curArgumentIndex;
                int prevArgumentPos    = curArgumentPos;
                int prevArgumentLength = curArgumentLength;
                curArgumentIndex  = argPositions[i].Item1;
                curArgumentPos    = argPositions[i].Item2;
                curArgumentLength = allowedArgPoints[curArgumentIndex].Length;

                if (prevArgumentIndex != -1 && curArgumentPos < prevArgumentPos + prevArgumentLength)
                {
                    // It's easier to fall back to the general case in case of overlapping arguments
                    return(false);
                }

                // Append the contents of 'str' preceding the current argument
                int literalStart = prevArgumentPos + prevArgumentLength;
                result = result.Append(strPoint.Substring(literalStart, curArgumentPos - literalStart));

                // The format may have included either the text or the placeholder
                string argName = "{" + argNames[curArgumentIndex] + "}";
                if (RequirePlaceholderForEveryArgument)
                {
                    result = result.Append(StringAutomaton.ConstantOn(1.0, argName));
                }
                else
                {
                    result = result.Append(StringAutomaton.ConstantOn(1.0, argName, allowedArgPoints[curArgumentIndex]));
                }
            }

            // Append the rest of 'str'
            result = result.Append(strPoint.Substring(curArgumentPos + curArgumentLength));

            resultDist = StringDistribution.FromWeightFunction(result);
            return(true);
        }