コード例 #1
0
            /// <summary>
            /// Builds the weight function obtained by repeating this weight function between
            /// <paramref name="minTimes"/> and <paramref name="maxTimes"/> times.
            /// </summary>
            /// <param name="minTimes">Minimum number of repetitions. Must be non-negative. Defaults to 1.</param>
            /// <param name="maxTimes">
            /// Maximum number of repetitions. Must not be less than <paramref name="minTimes"/>.
            /// When <see langword="null"/> (the default) the value is passed through to the
            /// automaton implementation - presumably meaning "no upper bound"; confirm against
            /// the automaton's <c>Repeat</c>.
            /// </param>
            /// <returns>The repeated weight function, in the cheapest representation that applies.</returns>
            /// <remarks>
            /// Three strategies are tried in order:
            /// - a point mass with a bounded, small enough repetition range is expanded directly
            ///   into a point mass or a dictionary;
            /// - a dictionary whose repeated support is estimated to fit into
            ///   <c>MaxDictionarySize</c> entries is repeated as a dictionary;
            /// - otherwise the weight function is converted to an automaton and repeated there.
            /// Argument validation is delegated to <c>Argument.CheckIfInRange</c> /
            /// <c>Argument.CheckIfValid</c>; the exception types they throw are not visible here.
            /// </remarks>
            public MultiRepresentationWeightFunction <TDictionary> Repeat(int minTimes = 1, int?maxTimes = null)
            {
                Argument.CheckIfInRange(minTimes >= 0, nameof(minTimes), "The minimum number of repetitions must be non-negative.");
                Argument.CheckIfValid(!maxTimes.HasValue || maxTimes.Value >= minTimes, "The maximum number of repetitions must not be less than the minimum number.");

                if (weightFunction is PointMassWeightFunction pointMass && maxTimes.HasValue && maxTimes - minTimes < MaxDictionarySize)
                {
                    // Grows incrementally: after k appends it holds the point repeated k times.
                    var newSequenceElements = new List <TElement>(SequenceManipulator.GetLength(pointMass.Point) * maxTimes.Value);
                    for (int i = 0; i < minTimes; ++i)
                    {
                        newSequenceElements.AddRange(pointMass.Point);
                    }
                    if (minTimes == maxTimes)
                    {
                        // Exactly one possible repetition count: the result is still a point mass.
                        return(FromPoint(SequenceManipulator.ToSequence(newSequenceElements)));
                    }
                    else
                    {
                        // NOTE(review): there are (maxTimes - minTimes + 1) entries, each weighted
                        // 1 / (maxTimes - minTimes), so the weights do not sum to 1. Presumably the
                        // overall scale is irrelevant to callers - confirm.
                        // NOTE(review): if the point is an empty sequence, every key built below is
                        // the same empty sequence; Dictionary.Add would then throw on the second
                        // insert (assuming TSequence has value equality) - confirm.
                        Weight uniformWeight = Weight.FromValue(1.0 / (maxTimes.Value - minTimes));
                        Dictionary <TSequence, Weight> dict = new Dictionary <TSequence, Weight>(maxTimes.Value - minTimes + 1);
                        dict.Add(SequenceManipulator.ToSequence(newSequenceElements), uniformWeight);
                        for (int i = minTimes + 1; i <= maxTimes.Value; ++i)
                        {
                            newSequenceElements.AddRange(pointMass.Point);
                            dict.Add(SequenceManipulator.ToSequence(newSequenceElements), uniformWeight);
                        }
                        return(FromDictionary(DictionaryWeightFunction <TDictionary> .FromDistinctWeights(dict)));
                    }
                }
                if (weightFunction is TDictionary dictionary && maxTimes.HasValue)
                {
                    var resultSupportSize = ResultSupportSize(dictionary.Dictionary.Count, minTimes, maxTimes.Value);
                    if (resultSupportSize <= MaxDictionarySize)
                    {
                        return(FromDictionary(dictionary.Repeat(minTimes, maxTimes.Value, (int)resultSupportSize + 1)));
                    }
                }

                // General case: no compact representation applies (or maxTimes is null).
                return(FromAutomaton(AsAutomaton().Repeat(minTimes, maxTimes)));

                // Number of sequences obtainable by concatenating between minReps and maxReps
                // entries of a dictionary with sourceSupportSize entries, i.e.
                // sum over k in [minReps, maxReps] of sourceSupportSize^k.
                // Computed in double so that huge results become large values / infinity
                // instead of overflowing an integer.
                double ResultSupportSize(int sourceSupportSize, int minReps, int maxReps)
                {
                    if (sourceSupportSize == 1)
                    {
                        // The closed form below divides by (1 - sourceSupportSize) and would
                        // evaluate to 0/0 = NaN here; NaN fails the `<= MaxDictionarySize`
                        // comparison and would needlessly force the automaton path.
                        // Every term of the sum is 1, so the sum is just the number of terms.
                        return(maxReps - minReps + 1.0);
                    }

                    return(Math.Pow(sourceSupportSize, minReps) * (1 - Math.Pow(sourceSupportSize, maxReps - minReps + 1)) / (1 - sourceSupportSize));
                }
            }
コード例 #2
0
        /// <summary>
        /// Enumerates the support of this automaton without eliminating duplicate elements.
        /// </summary>
        /// <param name="maxTraversedPaths">
        /// Maximum number of paths in the automaton this function
        /// is allowed to traverse before stopping. Defaults to <see cref="int.MaxValue"/>.
        /// Can be used to limit the performance impact of this call in cases when
        /// the support is useful only if it can be obtained quickly.
        /// </param>
        /// <param name="stopOnNonPointMassElementDistribution">
        /// When set to true, the enumeration is canceled upon encountering a non-point mass
        /// element distribution on a transition and a <see langword="null"/> value is yielded.
        /// </param>
        /// <returns>
        /// The sequences supporting this automaton. Sequences may be non-distinct if the
        /// automaton is not determinized. A <see langword="null"/> value in the enumeration means
        /// that an infinite loop was reached or that the enumeration was stopped because
        /// a condition set by one of this method's parameters was met.
        /// Public <see cref="EnumerateSupport(int)"/> /
        /// <see cref="TryEnumerateSupport(int, out IEnumerable{TSequence}, int, bool)"/>
        /// methods handle null value differently.
        /// </returns>
        /// <remarks>
        /// Conceptually, enumerating support is just a depth-first traversal of the automaton from
        /// the start state, recording the elements met on the path. The real implementation is a
        /// little hairy for the following reasons:
        /// - Recursion can not be used, instead an explicit stack is used
        /// - Each transition can have a distribution with a lot of support, so it has to be
        ///   enumerated lazily, making state stored on stack large
        /// - Some highly-branching paths can have no end state in them. This fact has to be
        ///   tracked to avoid spending exponential time traversing states which produce no output
        /// - Loops have to be tracked: some loops make automaton non-enumerable, some don't
        /// - A fast-path for non-branchy automata is implemented. It makes traversing those 10x
        ///   faster by skipping some boilerplate for tracking traversal state in these cases.
        /// </remarks>
        private IEnumerable <TSequence> EnumerateSupportInternalWithDuplicates(
            int maxTraversedPaths = int.MaxValue,
            bool stopOnNonPointMassElementDistribution = false)
        {
            // Sequence of elements on path to current state in automaton
            var sequence = new List <TElement>();

            // Stack of states for backtracking during depth-first traversal
            var stack = new Stack <StateEnumerationState>();

            // Stores 2 bits of data about state:
            // - Is this state a dead end. I.e. end state is not reachable through any path starting
            //   from this state. This flag is computed incrementally most of the time. But if
            //   automaton contains loops, a special procedure `ComputeEndStateReachability` is
            //   invoked which computes it efficiently for whole graph.
            // - Whether this state is being visited now and if yes - how long is the sequence from
            //   root to this state. This is used for (non-empty) loop detection. We store length + 1,
            //   because 0 value of flags has special meaning of "this state has not been visited yet".
            var flags = new StateEnumerationFlags[this.States.Count];

            // Number of sequences produced by this time. By comparing the value of this counter
            // before entering the state and when leaving it, it is easy to detect whether the state
            // is a dead end.
            var producedCount = 0;

            // Number of paths traversed by this time. Used in early break condition.
            var traversedPathsCount = 0;

            // Enumeration state for current state. A top of traversal stack, materialized in local
            // variable for convenience. The longest output on the path is 0 if the start state
            // already outputs the empty sequence, and -1 (nothing output yet) otherwise.
            var current = InitEnumerationState(this.Start.Index, 0, this.Start.CanEnd ? 0 : -1);

            // If true, we can assume that `IsDeadEnd` flag is computed for all states
            var endStateReachabilityComputed = false;

            // Mark the first state as visited (path length 0 is stored as 0 + 1)
            flags[this.Start.Index] = (StateEnumerationFlags)1;

            if (this.Start.CanEnd)
            {
                // We do not ever "enter" the start state via a transition, so need to handle
                // "start state is also an end state" case explicitly
                yield return(SequenceManipulator.ToSequence(sequence));
            }

            while (true)
            {
                if (!TryBacktrackIfNeed())
                {
                    // Nowhere to backtrack, enumerated everything
                    yield break;
                }

                // In theory, this operation could be called only when `current.PathLength`
                // decreases. But that happens in multiple places, so it's easier to truncate path
                // on each iteration, even if it is a noop.
                sequence.RemoveRange(current.PathLength, sequence.Count - current.PathLength);

                var transitionAdvancementResult = AdvanceToNextTransition(out var nextStateIndex, out var nextElement);
                if (transitionAdvancementResult == TransitionAdvancementResult.NoMoreTransitions)
                {
                    // No usable transition is left in the current state; the next iteration
                    // will backtrack from it.
                    continue;
                }
                if (transitionAdvancementResult == TransitionAdvancementResult.ShouldStopEnumeration)
                {
                    // A non-point mass element distribution was met while
                    // `stopOnNonPointMassElementDistribution` is set. Signal via null.
                    yield return(null);

                    yield break;
                }

                // Epsilon transitions carry no element, so the path is not extended for them
                if (nextElement.HasValue)
                {
                    sequence.Add(nextElement.Value);
                }

                nextStateIndex = TraverseTrivialStates(nextStateIndex);
                var nextState = this.Data.States[nextStateIndex];

                if (flags[nextStateIndex] != 0)
                {
                    ++traversedPathsCount;
                    if (traversedPathsCount > maxTraversedPaths)
                    {
                        // Hit the limit on the number of traversed paths.
                        yield return(null);

                        yield break;
                    }

                    // This state is either a dead end or a loop. If this is a dead end, then it
                    // will not be traversed at all.
                    if ((flags[nextStateIndex] & StateEnumerationFlags.IsDeadEnd) != 0)
                    {
                        continue;
                    }

                    // This is a loop, so let's check if it can produce anything.
                    // If it produces anything - then support is non-enumerable.
                    // If it doesn't, there's no point in entering this state at all
                    if (!endStateReachabilityComputed)
                    {
                        ComputeEndStateReachability(flags);
                        endStateReachabilityComputed = true;
                    }

                    if ((flags[nextStateIndex] & StateEnumerationFlags.IsDeadEnd) != 0 ||
                        sequence.Count + 1 == (int)(flags[nextStateIndex] & StateEnumerationFlags.DepthMask))
                    {
                        // Just skip this loop
                        // - Either no end state is reachable from this loop. So it can "produce"
                        //   as many elements as it wants, this does not matter because this
                        //   sequence would never be output.
                        // - Or this loop produces 0 elements (i.e. consists of epsilon transitions
                        //   only), because path was extended by 0 elements after traversing the loop.
                        continue;
                    }

                    // A non-empty loop that can produce something was found. Signal to outer call
                    // via returning null.
                    yield return(null);

                    yield break;
                }

                var producedBeforeThisState = producedCount;
                if (nextState.CanEnd && sequence.Count > current.LongestOutputInPath)
                {
                    // Output sequence only if it is longer than any sequence produced on the path
                    // to this state. This is an important optimization in some real automata.
                    // Simplified example: graph `a -> b -> c -> d -> e`. Where both states `a` and
                    // `e` are marked as end states and every arrow corresponds to 2 epsilon
                    // transitions. There are 2^4 = 16 unique paths from `a` to `e`. We need to
                    // detect that `e` is practically a dead end even though it has `CanEnd` flag.
                    // By noticing that `e` and `a` states produce sequences of the same length,
                    // we can assume that `e` is effectively not an end state because it does not
                    // produce any new sequences.
                    ++producedCount;
                    ++traversedPathsCount;
                    if (traversedPathsCount > maxTraversedPaths)
                    {
                        // Hit the limit on the number of traversed paths.
                        yield return(null);

                        yield break;
                    }
                    yield return(SequenceManipulator.ToSequence(sequence));
                }

                if (nextState.TransitionsCount == 0)
                {
                    // Fast path: there is no need to enter this state at all because we would
                    // backtrack immediately to current state on next iteration.
                    if (!nextState.CanEnd)
                    {
                        flags[nextStateIndex] = StateEnumerationFlags.IsDeadEnd;
                    }
                }
                else
                {
                    // Slow path: store current state on stack and move to next state.
                    // The flag records "being visited" together with the path length + 1.
                    flags[nextStateIndex] = (StateEnumerationFlags)(sequence.Count + 1);
                    stack.Push(current);
                    current = InitEnumerationState(
                        nextStateIndex,
                        producedBeforeThisState,
                        nextState.CanEnd ? sequence.Count : current.LongestOutputInPath);
                }
            }

            // Creates the enumeration state for entering the state with the given index.
            // The current length of `sequence` is captured as the path length, and the transition
            // cursor is positioned just before the first transition of the state.
            StateEnumerationState InitEnumerationState(
                int index, int producedBeforeThisState, int longestOutputInPath)
            {
                var state = this.Data.States[index];

                return(new StateEnumerationState
                {
                    StateIndex = index,
                    ProducedCount = producedBeforeThisState,
                    PathLength = sequence.Count,
                    LongestOutputInPath = longestOutputInPath,
                    TransitionIndex = state.FirstTransitionIndex - 1,
                    RemainingTransitionsCount = state.TransitionsCount,
                    ElementEnumerator = null,
                });
            }

            // Backtracks according to traversal stack if that is needed.
            // Returns false if automaton has been fully enumerated.
            //
            // If backtracking is needed (i.e. no more non-processed transitions left in current
            // state), replaces `current` with the current top of the stack if that's possible.
            // Also updates the `flags` with information whether path through this state produces
            // any output.
            bool TryBacktrackIfNeed()
            {
                // While no more transitions are left in current state - backtrack
                while (current.ElementEnumerator == null && current.RemainingTransitionsCount == 0)
                {
                    if (stack.Count == 0)
                    {
                        // Nowhere to backtrack, this automaton has been fully enumerated.
                        // Record that fact on the automaton data unless it is already known.
                        if (this.Data.IsEnumerable == null)
                        {
                            this.Data = this.Data.With(isEnumerable: true);
                        }

                        return(false);
                    }

                    // If upon leaving current state, number of produced sequences is equal to
                    // the number upon entrance then this is a dead end. Otherwise the flag is
                    // reset to 0, i.e. back to "not visited".
                    flags[current.StateIndex] = current.ProducedCount == producedCount
                        ? StateEnumerationFlags.IsDeadEnd
                        : 0;

                    current = stack.Pop();
                }

                return(true);
            }

            // Updates `current` enumeration state by moving to next element/state index reachable
            // from current state.
            //
            // Returns `Success` with `nextStateIndex`/`nextElement` set (`nextElement` is empty for
            // epsilon transitions), `NoMoreTransitions` when the current state is exhausted, or
            // `ShouldStopEnumeration` when a non-point mass element distribution is met and
            // `stopOnNonPointMassElementDistribution` is set.
            TransitionAdvancementResult AdvanceToNextTransition(out int nextStateIndex, out Option <TElement> nextElement)
            {
                Debug.Assert(
                    current.ElementEnumerator != null || current.RemainingTransitionsCount != 0,
                    "TryBacktrack() must skip all states with no transitions left");

                if (current.ElementEnumerator != null)
                {
                    // Advance to next element in current transition
                    nextStateIndex = this.Data.Transitions[current.TransitionIndex].DestinationStateIndex;
                    if ((flags[nextStateIndex] & StateEnumerationFlags.IsDeadEnd) != 0)
                    {
                        // While enumerating current transition we learned that it leads to a
                        // dead end. Stop enumerating elements on this transition, because that
                        // would only waste time on enumerating a dead end.
                        current.ElementEnumerator = null;
                    }
                    else
                    {
                        // The enumerator is stored only after a successful `MoveNext()`, so
                        // `Current` already holds the element to return now.
                        nextElement = current.ElementEnumerator.Current;
                        if (!current.ElementEnumerator.MoveNext())
                        {
                            // No more elements on this transition, move to next transition on
                            // next iteration
                            current.ElementEnumerator = null;
                        }

                        return(TransitionAdvancementResult.Success);
                    }
                }

                // Advance to next transition
                while (current.RemainingTransitionsCount != 0)
                {
                    ++current.TransitionIndex;
                    --current.RemainingTransitionsCount;

                    var transition = this.Data.Transitions[current.TransitionIndex];
                    nextStateIndex = transition.DestinationStateIndex;

                    if (transition.Weight.IsZero ||
                        (flags[nextStateIndex] & StateEnumerationFlags.IsDeadEnd) != 0)
                    {
                        // Do not follow paths which produce nothing and try next transition
                        continue;
                    }

                    if (transition.IsEpsilon)
                    {
                        nextElement = Option.None;
                    }
                    else
                    {
                        var elementDistribution = transition.ElementDistribution.Value;
                        if (elementDistribution.IsPointMass)
                        {
                            nextElement = elementDistribution.Point;
                        }
                        else
                        {
                            if (stopOnNonPointMassElementDistribution)
                            {
                                nextStateIndex = -1;
                                nextElement    = Option.None;
                                return(TransitionAdvancementResult.ShouldStopEnumeration);
                            }
                            if (!(elementDistribution is CanEnumerateSupport <TElement> supportEnumerator))
                            {
                                // The element distribution can not enumerate its support, so
                                // neither can the automaton. Remember that before failing.
                                this.Data = this.Data.With(isEnumerable: false);
                                throw new InvalidOperationException();
                            }

                            var enumerator = supportEnumerator.EnumerateSupport().GetEnumerator();
                            if (!enumerator.MoveNext())
                            {
                                // This transition is not marked as epsilon (i.e. contains
                                // distribution on it). But this distribution contains no support.
                                // Go to next transition.
                                continue;
                            }

                            // Return the first element now and keep the enumerator (already
                            // advanced to the second element) only if more elements remain.
                            nextElement = enumerator.Current;
                            current.ElementEnumerator = enumerator.MoveNext() ? enumerator : null;
                        }
                    }

                    return(TransitionAdvancementResult.Success);
                }

                nextStateIndex = -1;
                nextElement    = Option.None;
                return(TransitionAdvancementResult.NoMoreTransitions);
            }

            // Traverses all trivial states - non-terminal, not yet visited states with only one
            // non-epsilon forward transition that has non-zero weight and a point mass element
            // distribution. The elements met on the way are appended to `sequence`.
            // Returns the index of the first non-trivial state reached.
            //
            // It is safe (and a lot faster) to traverse these states without involving
            // backtracking logic or complicated code for enumerating transitions. The only tricky
            // part is loop detection - because these states are not backtracked, flags on them
            // won't be updated. But that is safe because a loop involves at least 1 backward
            // transition, and that one won't be fast-tracked.
            //
            // This procedure is fast-path optimization for automata with very low uncertainty.
            // Most automata in practice belong to this category.
            int TraverseTrivialStates(int currentStateIndex)
            {
                while (true)
                {
                    var state = this.Data.States[currentStateIndex];

                    if (flags[currentStateIndex] != 0 ||
                        state.CanEnd ||
                        state.TransitionsCount != 1)
                    {
                        return(currentStateIndex);
                    }

                    var transition = this.Data.Transitions[state.FirstTransitionIndex];

                    // Only forward transitions (to a state with a larger index) are fast-tracked
                    if (transition.Weight.IsZero ||
                        transition.IsEpsilon ||
                        transition.DestinationStateIndex <= currentStateIndex)
                    {
                        return(currentStateIndex);
                    }

                    var dist = transition.ElementDistribution.Value;

                    if (!dist.IsPointMass)
                    {
                        return(currentStateIndex);
                    }

                    sequence.Add(dist.Point);
                    currentStateIndex = transition.DestinationStateIndex;
                }
            }
        }
コード例 #3
0
            /// <summary>
            /// Returns this weight function re-expressed in the simplest representation that
            /// this method is able to find for it.
            /// </summary>
            /// <returns>
            /// The zero weight function or a point mass when the non-zero support turns out to have
            /// no or exactly one element, a dictionary when that is applicable and estimated to be
            /// the smaller representation, and a clone of this weight function otherwise.
            /// </returns>
            public MultiRepresentationWeightFunction <TDictionary> NormalizeStructure()
            {
                if (weightFunction is TDictionary dictionary)
                {
                    // Two non-zero entries are enough to tell "none", "exactly one" and "several" apart.
                    var nonZeroEntries = dictionary.Dictionary
                        .Where(entry => !entry.Value.IsZero)
                        .Take(2)
                        .ToList();

                    switch (nonZeroEntries.Count)
                    {
                        case 0:
                            return Zero();

                        case 1:
                            return FromPoint(nonZeroEntries[0].Key);

                        default:
                            return FromDictionary(dictionary.NormalizeStructure());
                    }
                }

                if (weightFunction is TAutomaton automaton && !automaton.UsesGroups)
                {
                    if (automaton.LogValueOverride == null &&
                        automaton.TryEnumerateSupport(MaxDictionarySize, out var support, false, 4 * MaxDictionarySize, true))
                    {
                        var sequences = support.ToList();

                        if (sequences.Count == 0)
                        {
                            return Zero();
                        }

                        if (sequences.Count == 1)
                        {
                            return FromPoint(sequences[0]);
                        }

                        // Switch to a dictionary only when it is expected to be smaller than the
                        // automaton. The estimate uses the sizes of a string automaton, which is
                        // the most common kind. It does not have to be exact - being right most
                        // of the time is good enough.
                        var sequenceBytes      = sequences.Sum(seq => SequenceManipulator.GetLength(seq)) * sizeof(char);
                        var entryOverheadBytes = (24 + 8 + sizeof(double)) * sequences.Count;
                        var dictionaryBytes    = sequenceBytes + entryOverheadBytes;

                        // Object header, 2 `double?` fields, then the data container: flags and
                        // start state index plus the headers of the states and transitions arrays.
                        var fixedBytes = 24 + 16 + 2 * sizeof(double) + 2 * sizeof(int) + 2 * 24;
                        var stateBytes = automaton.Data.States.Count * (2 * sizeof(int) + sizeof(double));

                        // 24 is the size of one transition without the storage for its discrete char.
                        var transitionBytes = automaton.Data.Transitions.Count * 24;

                        // 80 = 40 for a DiscreteChar filled with nulls + 40 for an array holding a
                        // single char range. A specific DiscreteChar can be larger or can be
                        // cached, but this looks like a fair average.
                        var elementDistributionBytes = automaton.Data.Transitions.Count(tr => !tr.IsEpsilon) * 80;

                        var automatonBytes = fixedBytes + stateBytes + transitionBytes + elementDistributionBytes;

                        if (dictionaryBytes < automatonBytes)
                        {
                            var weightedSequences = sequences.Select(
                                seq => new KeyValuePair <TSequence, Weight>(seq, Weight.FromLogValue(automaton.GetLogValue(seq))));

                            return FromDictionary(DictionaryWeightFunction <TDictionary> .FromDistinctWeights(weightedSequences));
                        }
                    }

                    // TryEnumerateSupport(..., maxTraversedPaths, ...) may give up early on
                    // complex automata, so the point mass case has to be checked explicitly.
                    var point = automaton.TryComputePoint();
                    if (point != null)
                    {
                        return FromPoint(point);
                    }
                }

                return Clone(); // TODO: replace with `this` after making automata immutable
            }