Ejemplo n.º 1
0
        /// <include file='FactorDocs.xml' path='factor_docs/message_op_class[@name="SingleOp"]/message_doc[@name="CharacterAverageConditional(StringDistribution)"]/*'/>
        public static DiscreteChar CharacterAverageConditional(StringDistribution str)
        {
            Argument.CheckIfNotNull(str, "str");

            // A point-mass message is delegated to the overload that takes the point value.
            if (str.IsPointMass)
            {
                return CharacterAverageConditional(str.Point);
            }

            // One log-domain slot per possible char value, all starting at log(0).
            int alphabetSize = char.MaxValue + 1;
            Vector accumulatedLogProbs = PiecewiseVector.Constant(alphabetSize, double.NegativeInfinity);
            StringAutomaton automaton = str.GetWorkspaceOrPoint();

            // States in the epsilon closure of the start state.
            StringAutomaton.EpsilonClosure startClosure =
                new Automaton<string, char, DiscreteChar, StringManipulator, StringAutomaton>.EpsilonClosure(automaton, automaton.Start);

            for (int closureIndex = 0; closureIndex < startClosure.Size; closureIndex++)
            {
                StringAutomaton.State sourceState = startClosure.GetStateByIndex(closureIndex);
                Weight sourceWeight = startClosure.GetStateWeightByIndex(closureIndex);

                foreach (var transition in sourceState.Transitions)
                {
                    // Only character-consuming transitions can contribute a character.
                    if (transition.IsEpsilon)
                    {
                        continue;
                    }

                    StringAutomaton.State targetState = automaton.States[transition.DestinationStateIndex];
                    StringAutomaton.EpsilonClosure targetClosure =
                        new Automaton<string, char, DiscreteChar, StringManipulator, StringAutomaton>.EpsilonClosure(automaton, targetState);

                    // Skip targets whose closure carries zero end weight.
                    if (targetClosure.EndWeight.IsZero)
                    {
                        continue;
                    }

                    // Combined weight: closure state weight * transition weight * end weight of the target closure.
                    Weight pathWeight = Weight.Product(sourceWeight, transition.Weight, targetClosure.EndWeight);

                    // Convert the transition's character probabilities to logs in place, then fold them in.
                    var charLogProbs = transition.ElementDistribution.Value.GetProbs();
                    charLogProbs.SetToFunction(charLogProbs, Math.Log);
                    accumulatedLogProbs = LogSumExp(accumulatedLogProbs, charLogProbs, pathWeight);
                }
            }

            // Every slot still at log(0): nothing was accumulated for any character.
            if (accumulatedLogProbs.All(double.IsNegativeInfinity))
            {
                throw new AllZeroException("An input distribution assigns zero probability to all single character strings.");
            }

            // Shift by the log-normalizer before exponentiating.
            Vector normalizedProbs = PiecewiseVector.Zero(alphabetSize);
            double logNormalizer = accumulatedLogProbs.LogSumExp();
            normalizedProbs.SetToFunction(accumulatedLogProbs, logProb => Math.Exp(logProb - logNormalizer));

            return DiscreteChar.FromVector(normalizedProbs);
        }
Ejemplo n.º 2
0
        /// <summary>EP message to <c>character</c>.</summary>
        /// <param name="str">Incoming message from <c>str</c>. Checked with <c>Argument.CheckIfNotNull</c>.</param>
        /// <returns>The outgoing EP message to the <c>character</c> argument.</returns>
        /// <exception cref="AllZeroException">
        /// Thrown when no log-probability was accumulated for any character, i.e. <paramref name="str"/>
        /// assigns zero probability to all single character strings.
        /// </exception>
        /// <remarks>
        ///   <para>The outgoing message is a distribution matching the moments of <c>character</c> as the random arguments are varied. The formula is <c>proj[p(character) sum_(str) p(str) factor(character,str)]/p(character)</c>.</para>
        ///   <para>The accumulated log-probabilities are shifted by their log-sum before being exponentiated, so that very small (but non-zero) weights do not underflow to an all-zero probability vector.</para>
        /// </remarks>
        public static DiscreteChar CharacterAverageConditional(StringDistribution str)
        {
            Argument.CheckIfNotNull(str, "str");

            // One log-domain slot per possible char value, all starting at log(0).
            Vector          resultLogProb = PiecewiseVector.Constant(char.MaxValue + 1, double.NegativeInfinity);
            StringAutomaton probFunc      = str.GetProbabilityFunction();

            // Walk every state in the epsilon closure of the start state.
            StringAutomaton.EpsilonClosure startEpsilonClosure = probFunc.Start.GetEpsilonClosure();
            for (int stateIndex = 0; stateIndex < startEpsilonClosure.Size; ++stateIndex)
            {
                StringAutomaton.State state = startEpsilonClosure.GetStateByIndex(stateIndex);
                double stateLogWeight       = startEpsilonClosure.GetStateLogWeightByIndex(stateIndex);
                for (int transitionIndex = 0; transitionIndex < state.Transitions.Count; ++transitionIndex)
                {
                    StringAutomaton.Transition transition = state.Transitions[transitionIndex];
                    if (!transition.IsEpsilon)
                    {
                        StringAutomaton.State          destState        = probFunc.States[transition.DestinationStateIndex];
                        StringAutomaton.EpsilonClosure destStateClosure = destState.GetEpsilonClosure();

                        // Only destinations whose closure has a non-zero end weight contribute.
                        if (!double.IsNegativeInfinity(destStateClosure.EndLogWeight))
                        {
                            // Log-weights add: closure state + transition + end weight of the destination closure.
                            double logWeight = stateLogWeight + transition.LogWeight + destStateClosure.EndLogWeight;
                            resultLogProb = LogSumExp(resultLogProb, transition.ElementDistribution.GetInternalDiscrete().GetLogProbs(), logWeight);
                        }
                    }
                }
            }

            if (resultLogProb.All(double.IsNegativeInfinity))
            {
                throw new AllZeroException("An input distribution assigns zero probability to all single character strings.");
            }

            // Normalize in the log domain before exponentiating: exponentiating the raw
            // log-probabilities can underflow every entry to 0 when all weights are tiny,
            // even though the all-zero check above has passed.
            Vector resultProb    = PiecewiseVector.Zero(char.MaxValue + 1);
            double logNormalizer = resultLogProb.LogSumExp();

            resultProb.SetToFunction(resultLogProb, lp => Math.Exp(lp - logNormalizer));
            return DiscreteChar.FromVector(resultProb);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds the complement of this distribution: a distribution that is uniform over
        /// every character to which this distribution assigns zero probability.
        /// </summary>
        /// <remarks>
        /// Handy for expressing "anything but" character classes, such as
        /// "not a letter" or "not a word character".
        /// </remarks>
        /// <returns>The complement distribution.</returns>
        public DiscreteChar Complement()
        {
            // Design note: for a point mass the vector is built with a non-zero common value
            // and an explicit zero entry. That reads well when the distribution is displayed
            // (it shows up as a 'complement'), but it may have unforeseen side effects,
            // e.g. on performance.
            // todo: consider revisiting this design.
            PiecewiseVector complementWeights;

            if (!this.IsPointMass)
            {
                // Weight 1 wherever this distribution has exactly zero probability, 0 elsewhere.
                complementWeights = PiecewiseVector.Zero(this.Dimension);
                complementWeights.SetToFunction(this.disc.GetWorkspace(), prob => prob == 0.0 ? 1.0 : 0.0);
            }
            else
            {
                // Weight 1 everywhere except at the single point carrying all the mass.
                complementWeights = PiecewiseVector.Constant(this.Dimension, 1.0);
                complementWeights[this.Point] = 0;
            }

            return DiscreteChar.FromVector(complementWeights);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Entry point of the motif finder sample: samples synthetic sequences,
        /// defines the model, runs inference and prints the results.
        /// </summary>
        public static void Main()
        {
            // Restart the random number generator with a fixed seed.
            Rand.Restart(1337);

            const int TotalSequences = 50;
            const int BasesPerSequence = 25;
            const double MotifPrior = 0.8;

            //// Generate synthetic data

            // One nucleobase distribution per motif position.
            var groundTruthMotif = new[]
            {
                NucleobaseDist(a: 0.8, c: 0.1, g: 0.05, t: 0.05),
                NucleobaseDist(a: 0.0, c: 0.9, g: 0.05, t: 0.05),
                NucleobaseDist(a: 0.0, c: 0.0, g: 0.5, t: 0.5),
                NucleobaseDist(a: 0.25, c: 0.25, g: 0.25, t: 0.25),
                NucleobaseDist(a: 0.1, c: 0.1, g: 0.1, t: 0.7),
                NucleobaseDist(a: 0.0, c: 0.0, g: 0.9, t: 0.1),
                NucleobaseDist(a: 0.9, c: 0.05, g: 0.0, t: 0.05),
                NucleobaseDist(a: 0.5, c: 0.5, g: 0.0, t: 0.0),
            };

            int motifWidth = groundTruthMotif.Length;
            var backgroundDist = NucleobaseDist(a: 0.25, c: 0.25, g: 0.25, t: 0.25);

            string[] sampledSequences;
            int[] sampledMotifPositions;
            SampleMotifData(
                TotalSequences,
                BasesPerSequence,
                MotifPrior,
                groundTruthMotif,
                backgroundDist,
                out sampledSequences,
                out sampledMotifPositions);

            //// Define the model

            // Dirichlet pseudo-counts over all char values: 1e-6 everywhere, 2.0 for the four nucleobases.
            Vector pseudoCounts = PiecewiseVector.Constant(char.MaxValue + 1, 1e-6);
            foreach (char nucleobase in "TGCA")
            {
                pseudoCounts[nucleobase] = 2.0;
            }

            var motifPositionRange = new Range(motifWidth);
            var motifProbs = Variable.Array<Vector>(motifPositionRange);
            motifProbs[motifPositionRange] = Variable.Dirichlet(pseudoCounts).ForEach(motifPositionRange);

            var sequenceRange = new Range(TotalSequences);
            var sequences = Variable.Array<string>(sequenceRange);

            // Motif start offset, uniform over the offsets at which the whole motif still fits.
            var motifStarts = Variable.Array<int>(sequenceRange);
            motifStarts[sequenceRange] = Variable.DiscreteUniform(BasesPerSequence - motifWidth + 1).ForEach(sequenceRange);

            var hasMotif = Variable.Array<bool>(sequenceRange);
            hasMotif[sequenceRange] = Variable.Bernoulli(MotifPrior).ForEach(sequenceRange);

            using (Variable.ForEach(sequenceRange))
            {
                // Motif present: background + motif + background.
                using (Variable.If(hasMotif[sequenceRange]))
                {
                    var motifChars = Variable.Array<char>(motifPositionRange);
                    motifChars[motifPositionRange] = Variable.Char(motifProbs[motifPositionRange]);
                    var motifString = Variable.StringFromArray(motifChars);

                    var rightPaddingLength = BasesPerSequence - motifWidth - motifStarts[sequenceRange];
                    var leftPadding = Variable.StringOfLength(motifStarts[sequenceRange], backgroundDist);
                    var rightPadding = Variable.StringOfLength(rightPaddingLength, backgroundDist);

                    sequences[sequenceRange] = leftPadding + motifString + rightPadding;
                }

                // Motif absent: the whole sequence is background.
                using (Variable.IfNot(hasMotif[sequenceRange]))
                {
                    sequences[sequenceRange] = Variable.StringOfLength(BasesPerSequence, backgroundDist);
                }
            }

            //// Run inference on the sampled sequences

            sequences.ObservedValue = sampledSequences;

            var engine = new InferenceEngine
            {
                Algorithm = new ExpectationPropagation(),
                NumberOfIterations = 30,
            };
            engine.Compiler.RecommendedQuality = QualityBand.Experimental;

            IList<Dirichlet> motifProbsPosterior = engine.Infer<IList<Dirichlet>>(motifProbs);
            IList<Bernoulli> hasMotifPosterior = engine.Infer<IList<Bernoulli>>(hasMotif);
            IList<Discrete> motifStartsPosterior = engine.Infer<IList<Discrete>>(motifStarts);

            //// Report the results

            PrintMotifInferenceResults(
                sampledSequences,
                sampledMotifPositions,
                groundTruthMotif,
                motifProbsPosterior,
                hasMotifPosterior,
                motifStartsPosterior);

            //// Wait for a key press so the console window stays open

            Console.ReadKey();
        }