//private bool AllVarianceZero(Dictionary<string, GaussianStatistics> caseNameToTarget)
        /// <summary>
        /// Reports whether the non-missing Gaussian statistics of the given leaves have zero variance.
        /// </summary>
        /// <param name="LeafCollection">Leaves to inspect.</param>
        /// <param name="caseNameToTarget">Maps a leaf to its statistics; leaves whose statistics report IsMissing() are skipped.</param>
        /// <returns>
        /// The zero-variance flag of the first non-missing leaf, or false when no non-missing leaf exists.
        /// </returns>
        /// <remarks>
        /// Two conditions are enforced through SpecialFunctions.CheckCondition: each non-missing entry has
        /// zero variance exactly when its sample size is 1, and all non-missing entries agree on whether
        /// their variance is zero.
        /// </remarks>
        private static bool AllVarianceZero(IEnumerable <Leaf> LeafCollection, Converter <Leaf, SufficientStatistics> caseNameToTarget)
        {
            // Stays null until the first non-missing leaf has been examined.
            bool? sharedZeroVarianceFlag = null;

            foreach (Leaf leaf in LeafCollection)
            {
                SufficientStatistics stats = caseNameToTarget(leaf);
                if (!stats.IsMissing())
                {
                    GaussianStatistics gaussian = (GaussianStatistics)stats;
                    bool hasZeroVariance = (gaussian.Variance == 0);

                    SpecialFunctions.CheckCondition(hasZeroVariance == (gaussian.SampleSize == 1), "Variance must be zero exactly when the sample size is 1");

                    if (sharedZeroVarianceFlag.HasValue)
                    {
                        // Every later leaf must agree with the first one.
                        SpecialFunctions.CheckCondition(sharedZeroVarianceFlag.Value == hasZeroVariance, "If any variances are zero, then all must be zero");
                    }
                    else
                    {
                        sharedZeroVarianceFlag = hasZeroVariance;
                    }
                }
            }

            // Empty (or all-missing) input yields false.
            return sharedZeroVarianceFlag ?? false;
        }
        /// <summary>
        /// Lazily reads a sparse var/cid/val file and yields, one variable at a time, the variable name
        /// paired with its caseId-to-value dictionary.
        /// </summary>
        /// <param name="sparseFileName">File handed to SpecialFunctions.TabFileTable with the header "var\tcid\tval".</param>
        /// <returns>One pair per run of consecutive rows that share the same "var" value.</returns>
        /// <remarks>
        /// The rows must be grouped by variable: SpecialFunctions.CheckCondition is invoked with a failing
        /// condition if a variable starts a second group, or if a caseId repeats within a variable.
        /// </remarks>
        static public IEnumerable <Pair <string, Dictionary <string, SufficientStatistics> > > LoadSparseFileEnumeration(string sparseFileName)
        {
            Set <string> finishedOrOpenVariables = new Set <string>();
            Pair <string, Dictionary <string, SufficientStatistics> > currentGroup = null;

            foreach (Dictionary <string, string> row in SpecialFunctions.TabFileTable(sparseFileName, "var\tcid\tval", false))
            {
                string variable = row["var"];

                bool startsNewGroup = (currentGroup == null) || (currentGroup.First != variable);
                if (startsNewGroup)
                {
                    // Hand the completed group to the consumer before opening the next one.
                    if (currentGroup != null)
                    {
                        yield return currentGroup;
                    }

                    SpecialFunctions.CheckCondition(!finishedOrOpenVariables.Contains(variable), string.Format("Input file ({0}) is not grouped by variable. Variable {1} appears in multiple places", sparseFileName, variable));
                    finishedOrOpenVariables.AddNew(variable);
                    currentGroup = new Pair <string, Dictionary <string, SufficientStatistics> >(variable, new Dictionary <string, SufficientStatistics>());
                }

                string caseId = row["cid"];
                SufficientStatistics parsedValue = SufficientStatistics.Parse(row["val"]);

                Dictionary <string, SufficientStatistics> caseIdToValue = currentGroup.Second;
                SpecialFunctions.CheckCondition(!caseIdToValue.ContainsKey(caseId), string.Format("Input file ({0}) for var {1} contains multiple entries for caseId {2}", sparseFileName, variable, caseId));
                caseIdToValue.Add(caseId, parsedValue);
            }

            // The last group has no following row to trigger its emission.
            if (currentGroup != null)
            {
                yield return currentGroup;
            }
        }
        /// <summary>
        /// Tries to read Gaussian statistics from text of the form "mean,variance,sampleSize".
        /// </summary>
        /// <param name="val">Text to parse; must not be null.</param>
        /// <param name="result">
        /// The parsed statistics on success; the missing instance when <paramref name="val"/> is "null"
        /// (case-insensitive); otherwise null.
        /// </param>
        /// <returns>True only when three comma-separated numeric fields were parsed.</returns>
        /// <remarks>
        /// The fields are parsed with the invariant culture. The format is comma-delimited, so parsing with
        /// the current culture is wrong on machines where "," is the decimal separator and "." the group
        /// separator (there "1.5" would be read as 15).
        /// NOTE(review): the "null" token sets <paramref name="result"/> to the missing instance but still
        /// returns false, unlike the Boolean parser in this code base. That behaviour is kept unchanged
        /// because callers may rely on it - confirm whether it is intentional.
        /// </remarks>
        internal static bool Parse(string val, out SufficientStatistics result)
        {
            result = null;
            if (val.Equals("null", StringComparison.InvariantCultureIgnoreCase))
            {
                result = GetMissingInstance();
                return(false);
            }

            string[] fields = val.Split(',');
            if (fields.Length != 3)
            {
                return(false);
            }

            // Same number styles that the culture-less TryParse overloads use by default.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            System.Globalization.NumberStyles realStyle = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
            System.Globalization.NumberStyles integerStyle = System.Globalization.NumberStyles.Integer;

            double mean, variance;
            int    sampleSize;

            if (double.TryParse(fields[0], realStyle, invariant, out mean) &&
                double.TryParse(fields[1], realStyle, invariant, out variance) &&
                int.TryParse(fields[2], integerStyle, invariant, out sampleSize))
            {
                result = GaussianStatistics.GetInstance(mean, variance, sampleSize);
                return(true);
            }

            return(false);
        }
 /// <summary>
 /// Recognises the tokens "null" and "missing" (in any letter case) as the missing value.
 /// </summary>
 /// <param name="val">Text to inspect; must not be null.</param>
 /// <param name="result">Singleton when <paramref name="val"/> is one of the tokens; otherwise null.</param>
 /// <returns>True when <paramref name="result"/> was set to a non-null value.</returns>
 /// <remarks>
 /// The comparison is ordinal. A current-culture comparison would fail to match "MISSING" under
 /// Turkish cultures, where upper-case "I" does not fold to "i".
 /// </remarks>
 public static bool Parse(string val, out SufficientStatistics result)
 {
     result = null;
     if (val.Equals("null", StringComparison.OrdinalIgnoreCase) || val.Equals("missing", StringComparison.OrdinalIgnoreCase))
     {
         result = Singleton;
     }
     return(result != null);
 }
Пример #5
0
        /// <summary>
        /// Builds a dictionary from leaf case name to the integer form of the leaf's Boolean statistics.
        /// </summary>
        /// <param name="leafToStatsMap">Maps a leaf to its statistics; non-missing results are cast to BooleanStatistics.</param>
        /// <param name="fullLeafCollection">Leaves to convert; enumerated once for sizing and once for conversion.</param>
        /// <returns>One entry per leaf whose statistics are not missing.</returns>
        private static Dictionary <string, int> SufficientStatisticsMapToIntDictionaryMap(Converter <Leaf, SufficientStatistics> leafToStatsMap, IEnumerable <Leaf> fullLeafCollection)
        {
            // Pre-size for the worst case in which no leaf is missing.
            int leafCount = SpecialFunctions.Count(fullLeafCollection);
            Dictionary <string, int> caseNameToInt = new Dictionary <string, int>(leafCount);

            foreach (Leaf leaf in fullLeafCollection)
            {
                SufficientStatistics stats = leafToStatsMap(leaf);
                if (stats.IsMissing())
                {
                    continue;
                }
                caseNameToInt.Add(leaf.CaseName, (int)(BooleanStatistics)stats);
            }

            return caseNameToInt;
        }
        /// <summary>
        /// Tries to interpret the text as a single floating-point value.
        /// </summary>
        /// <param name="val">Text to parse with double.TryParse (current culture).</param>
        /// <param name="result">A ContinuousStatistics instance for the parsed value, or null on failure.</param>
        /// <returns>True when the text was a valid double.</returns>
        internal static bool Parse(string val, out SufficientStatistics result)
        {
            double parsedValue;
            if (!double.TryParse(val, out parsedValue))
            {
                result = null;
                return false;
            }

            result = ContinuousStatistics.GetInstance(parsedValue);
            return true;
        }
        /// <summary>
        /// Tries to interpret the text as a single integer class value.
        /// </summary>
        /// <param name="val">Text to parse with int.TryParse (current culture).</param>
        /// <param name="result">A DiscreteStatistics instance for the parsed value, or null on failure.</param>
        /// <returns>True when the text was a valid integer.</returns>
        internal static bool Parse(string val, out SufficientStatistics result)
        {
            int parsedValue;
            if (!int.TryParse(val, out parsedValue))
            {
                result = null;
                return false;
            }

            result = DiscreteStatistics.GetInstance(parsedValue);
            return true;
        }
        /// <summary>
        /// Reads an entire sparse var/cid/val file into a variable -> caseId -> value table.
        /// </summary>
        /// <param name="sparseFileName">File handed to SpecialFunctions.TabFileTable with the header "var\tcid\tval".</param>
        /// <returns>The populated nested dictionary.</returns>
        /// <remarks>
        /// SpecialFunctions.CheckCondition is invoked with a failing condition when a caseId repeats for a variable.
        /// NOTE(review): this relies on SpecialFunctions.GetValueOrDefault returning a usable inner dictionary
        /// that is stored in the outer table for a variable not seen before - confirm against that helper.
        /// </remarks>
        static public Dictionary <string, Dictionary <string, SufficientStatistics> > LoadSparseFileInMemory(string sparseFileName)
        {
            Dictionary <string, Dictionary <string, SufficientStatistics> > table = new Dictionary <string, Dictionary <string, SufficientStatistics> >();

            foreach (Dictionary <string, string> row in SpecialFunctions.TabFileTable(sparseFileName, "var\tcid\tval", false))
            {
                string variable = row["var"];
                string caseId = row["cid"];
                SufficientStatistics parsedValue = SufficientStatistics.Parse(row["val"]);

                Dictionary <string, SufficientStatistics> valuesForVariable = SpecialFunctions.GetValueOrDefault(table, variable);

                bool caseIdIsNew = !valuesForVariable.ContainsKey(caseId);
                SpecialFunctions.CheckCondition(caseIdIsNew, string.Format("Input file ({0}) for var {1} contains multiple entries for caseId {2}", sparseFileName, variable, caseId));

                valuesForVariable.Add(caseId, parsedValue);
            }

            return table;
        }
        /// <summary>
        /// Combines a predictor map and a target map into one map that yields the joint binary class of a leaf.
        /// </summary>
        /// <param name="predictorDistributionClassFunction">Maps a leaf to its predictor statistics.</param>
        /// <param name="targetDistributionClassFunction">Maps a leaf to its target statistics.</param>
        /// <returns>
        /// A converter producing the DistributionDiscreteJointBinary class (as DiscreteStatistics):
        /// Missing when either input is missing, otherwise FalseFalse / FalseTrue / TrueFalse / TrueTrue
        /// (predictor first, target second). Any class other than False counts as true.
        /// </returns>
        public override Converter <Leaf, SufficientStatistics> CreateAlternativeSufficientStatisticsMap(
            Converter <Leaf, SufficientStatistics> predictorDistributionClassFunction,
            Converter <Leaf, SufficientStatistics> targetDistributionClassFunction)
        {
            return delegate(Leaf leaf)
            {
                SufficientStatistics predictorStats = predictorDistributionClassFunction(leaf);
                SufficientStatistics targetStats = targetDistributionClassFunction(leaf);

                if (predictorStats.IsMissing() || targetStats.IsMissing())
                {
                    return (DiscreteStatistics)(int)DistributionDiscreteJointBinary.DistributionClass.Missing;
                }

                DiscreteStatistics predictorClass = (DiscreteStatistics)predictorStats;
                DiscreteStatistics targetClass = (DiscreteStatistics)targetStats;

                bool predictorIsFalse = predictorClass == (int)DistributionDiscreteBinary.DistributionClass.False;
                bool targetIsFalse = targetClass == (int)DistributionDiscreteBinary.DistributionClass.False;

                DistributionDiscreteJointBinary.DistributionClass jointClass;
                if (predictorIsFalse)
                {
                    jointClass = targetIsFalse
                        ? DistributionDiscreteJointBinary.DistributionClass.FalseFalse
                        : DistributionDiscreteJointBinary.DistributionClass.FalseTrue;
                }
                else
                {
                    jointClass = targetIsFalse
                        ? DistributionDiscreteJointBinary.DistributionClass.TrueFalse
                        : DistributionDiscreteJointBinary.DistributionClass.TrueTrue;
                }

                return (DiscreteStatistics)(int)jointClass;
            };
        }
        //public static bool operator !=(BooleanStatistics stats1, BooleanStatistics stats2)
        //{
        //    return !(stats1 == stats2);
        //}
        //public static bool operator ==(BooleanStatistics boolStats1, BooleanStatistics boolStats2)
        //{
        //    DiscreteStatistics d1 = boolStats1 as DiscreteStatistics;
        //    DiscreteStatistics d2 = boolStats2 as DiscreteStatistics;
        //    return d1 == d2;
        //}

        /// <summary>
        /// Tries to read a Boolean statistic from text.
        /// </summary>
        /// <param name="val">
        /// "true"/"1" for true, "false"/"0" for false, "null"/"-1" for missing. The words are matched
        /// case-insensitively with the current culture; the numerals must match exactly.
        /// </param>
        /// <param name="result">The matching BooleanStatistics instance, or null when nothing matched.</param>
        /// <returns>True when <paramref name="result"/> is non-null.</returns>
        new internal static bool Parse(string val, out SufficientStatistics result)
        {
            const StringComparison ignoreCase = StringComparison.CurrentCultureIgnoreCase;

            if (val.Equals("true", ignoreCase) || val == "1")
            {
                result = BooleanStatistics.GetInstance(true);
                return result != null;
            }

            if (val.Equals("false", ignoreCase) || val == "0")
            {
                result = BooleanStatistics.GetInstance(false);
                return result != null;
            }

            if (val.Equals("null", ignoreCase) || val == "-1")
            {
                result = BooleanStatistics.GetMissingInstance();
                return result != null;
            }

            // Unrecognised text.
            result = null;
            return false;
        }
Пример #11
0
        /// <summary>
        /// Combines a predictor map and a target map into one map that yields the joint class of a leaf.
        /// </summary>
        /// <param name="predictorMap">Maps a leaf to its predictor statistics.</param>
        /// <param name="targetMap">Maps a leaf to its target statistics.</param>
        /// <returns>
        /// A converter producing the DistributionDiscreteJoint class (as DiscreteStatistics):
        /// Missing when either input is missing, otherwise FalseFalse / FalseTrue / TrueFalse / TrueTrue
        /// (predictor first, target second). Any class other than False counts as true.
        /// </returns>
        public static Converter <Leaf, SufficientStatistics> CreateJointMap(Converter <Leaf, SufficientStatistics> predictorMap, Converter <Leaf, SufficientStatistics> targetMap)
        {
            return delegate(Leaf leaf)
            {
                SufficientStatistics predictorStats = predictorMap(leaf);
                SufficientStatistics targetStats = targetMap(leaf);

                if (predictorStats.IsMissing() || targetStats.IsMissing())
                {
                    return (DiscreteStatistics)(int)DistributionDiscreteJoint.DistributionClass.Missing;
                }

                DiscreteStatistics predictorClass = (DiscreteStatistics)predictorStats;
                DiscreteStatistics targetClass = (DiscreteStatistics)targetStats;

                bool predictorIsFalse = predictorClass == (int)DistributionDiscreteConditional.DistributionClass.False;
                bool targetIsFalse = targetClass == (int)DistributionDiscreteConditional.DistributionClass.False;

                DistributionDiscreteJoint.DistributionClass jointClass;
                if (predictorIsFalse)
                {
                    jointClass = targetIsFalse
                        ? DistributionDiscreteJoint.DistributionClass.FalseFalse
                        : DistributionDiscreteJoint.DistributionClass.FalseTrue;
                }
                else
                {
                    jointClass = targetIsFalse
                        ? DistributionDiscreteJoint.DistributionClass.TrueFalse
                        : DistributionDiscreteJoint.DistributionClass.TrueTrue;
                }

                return (DiscreteStatistics)(int)jointClass;
            };
        }
        //public override Converter<Leaf, SufficientStatistics> CreateTargetSufficientStatisticsMap(Dictionary<string, ISufficientStatistics> caseIdToNonMissingValue)
        //{
        //    return ISufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
        //}

        /// <summary>
        /// Builds a map that returns a leaf's target statistics only when both predictor and target are present.
        /// </summary>
        /// <param name="predictorDistributionClassFunction">Maps a leaf to its predictor statistics.</param>
        /// <param name="targetDistributionClassFunction">Maps a leaf to its target statistics.</param>
        /// <returns>
        /// A converter yielding the target statistics, or GaussianStatistics.GetMissingInstance() when
        /// either the predictor or the target is missing for the leaf.
        /// </returns>
        public override Converter <Leaf, SufficientStatistics> CreateAlternativeSufficientStatisticsMap(
            Converter <Leaf, SufficientStatistics> predictorDistributionClassFunction,
            Converter <Leaf, SufficientStatistics> targetDistributionClassFunction)
        {
            return delegate(Leaf leaf)
            {
                SufficientStatistics predictorStats = predictorDistributionClassFunction(leaf);
                SufficientStatistics targetValue = targetDistributionClassFunction(leaf);

                bool bothPresent = !predictorStats.IsMissing() && !targetValue.IsMissing();
                if (bothPresent)
                {
                    return targetValue;
                }

                // Either side missing: report the leaf as missing.
                return GaussianStatistics.GetMissingInstance();
            };
        }
 /// <summary>
 /// Wraps a caseId-to-value dictionary as a leaf-to-statistics converter.
 /// </summary>
 /// <param name="caseIdToNonMissingValue">Dictionary passed unchanged to SufficientStatistics.DictionaryToLeafMap.</param>
 /// <returns>The converter produced by SufficientStatistics.DictionaryToLeafMap.</returns>
 public Converter <Leaf, SufficientStatistics> CreateSufficientStatisticsMap(Dictionary <string, SufficientStatistics> caseIdToNonMissingValue)
 {
     Converter <Leaf, SufficientStatistics> leafToStatistics = SufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
     return leafToStatistics;
 }
Пример #14
0
        //protected override NullDataCollection CreateNullDataGenerator(ModelScorer modelScorer, PhyloTree phyloTree, RangeCollection nullIndexRangeCollection, Dictionary<string, Dictionary<string, BooleanStatistics>> predictorVariableToCaseIdToRealNonMissingValue)
        //{
        //    if (DateTime.Now.Date == new DateTime(2006, 6, 28).Date)  // for testing, force it to use the parametric bootstrap
        //    {
        //        return NullDataCollection.GetInstance(
        //            new NullDataGeneratorAlongTree(modelScorer, phyloTree, (ModelTesterDiscrete)this),
        //            nullIndexRangeCollection,
        //            predictorVariableToCaseIdToRealNonMissingValue);
        //    }


        //    return base.CreateNullDataGenerator(modelScorer, phyloTree, nullIndexRangeCollection, predictorVariableToCaseIdToRealNonMissingValue);
        //}

        //public override Converter<Leaf, SufficientStatistics> CreateTargetSufficientStatisticsMap(Dictionary<string, ISufficientStatistics> caseIdToNonMissingValue)
        //{
        //    return ISufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
        //}

        //public override Converter<Leaf, SufficientStatistics> CreatePredictorSufficientStatisticsMap(Dictionary<string, BooleanStatistics> caseIdToNonMissingValue)
        //{
        //    return CreateTargetSufficientStatisticsMap(caseIdToNonMissingValue);
        //}

        /// <summary>
        /// Builds one tab-delimited report line for the predictor/target pair described by
        /// <paramref name="rowAndTargetData"/>: leaf counts first, then either the log-likelihood-ratio
        /// difference and its p-value, or whatever CompleteRowWithNaN appends when the row is ignored.
        /// </summary>
        /// <param name="modelScorer">Passed through to ComputeLLR.</param>
        /// <param name="phyloTree">Tree used for the leaf counts and passed through to ComputeLLR.</param>
        /// <param name="rowAndTargetData">Supplies the row (variable names, null index) and the predictor/target dictionaries.</param>
        /// <param name="workList">Not read by the active code in this method (only referenced in commented-out code).</param>
        /// <param name="rowIndex">Written verbatim into the report line.</param>
        /// <param name="workListCount">Written verbatim into the report line.</param>
        /// <param name="workIndex">Written verbatim into the report line.</param>
        /// <returns>The completed report line.</returns>
        protected override string CreateReportLine(
            ModelScorer modelScorer,
            PhyloTree phyloTree,
            RowData rowAndTargetData,
            UniversalWorkList workList,
            int rowIndex, int workListCount, int workIndex)
        {
            //!!!there is very similar code in ModelTesterGaussian.cs

            // we're iterating over each predictor (e.g. hla), each target (e.g. position in the sequence,
            // and each possible substring at that position).
            // Then we ask the question, Does the presence of predictor (e.g. hla)
            // influence the probability that target (e.g. mer in position n1pos) will show up?
            // nullIndex specifies whether this is the true data or randomized data.
            Dictionary <string, string> row = rowAndTargetData.Row;
            string predictorVariable        = row[PhyloTree.PredictorVariableColumnName]; // e.g. hla
            string targetVariable           = row[PhyloTree.TargetVariableColumnName];    // e.g. A@182 (amino acid "A" at position 182)
            int    nullIndex = int.Parse(row[PhyloTree.NullIndexColumnName]);

            //Dictionary<string, bool> caseIdToNonMissingPredictorValue = workList.NullIndexToPredictorToCaseIdToNonMissingValue[nullIndex][predictorVariable];
            Dictionary <string, SufficientStatistics> caseIdToNonMissingPredictorValue = rowAndTargetData.PredictorData; //workList.GetCaseIdToNonMissingValueForNullIndexAndPredictorVariable(nullIndex, predictorVariable);
            Dictionary <string, SufficientStatistics> caseIdToNonMissingTargetValue    = rowAndTargetData.TargetData;

            // Inspect the first predictor value to decide whether the predictor is Boolean.
            // NOTE(review): the MoveNext() result is not checked and the enumerator is not disposed;
            // confirm callers never pass an empty predictor dictionary.
            IEnumerator <SufficientStatistics> enumerator = caseIdToNonMissingPredictorValue.Values.GetEnumerator();

            enumerator.MoveNext();
            SufficientStatistics representative = enumerator.Current;
            bool predictorIsBoolean             = representative is BooleanStatistics;

            Converter <Leaf, SufficientStatistics> targetDistributionClassFunction    = CreateSufficientStatisticsMap(caseIdToNonMissingTargetValue);
            Converter <Leaf, SufficientStatistics> predictorDistributionClassFunction = CreateSufficientStatisticsMap(caseIdToNonMissingPredictorValue);

            // A non-Boolean predictor gets a zero-filled two-element array instead of real counts.
            int[] predictorCounts = predictorIsBoolean ?
                                    phyloTree.CountsOfLeaves(predictorDistributionClassFunction, NullModelDistribution) : new int[2];
            int[] targetCounts = phyloTree.CountsOfLeaves(targetDistributionClassFunction, NullModelDistribution);


            int predictorFalseNameCount = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
            int predictorTrueNameCount  = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.True];
            int targetFalseNameCount    = targetCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
            int targetTrueNameCount     = targetCounts[(int)DistributionDiscreteBinary.DistributionClass.True];

            // Likewise, the four Fisher counts are all zero for a non-Boolean predictor.
            int[] fisherCounts = predictorIsBoolean ?
                                 phyloTree.FisherCounts(predictorDistributionClassFunction, targetDistributionClassFunction) : new int[4];

            // Boolean predictor: sum of the four Fisher counts; otherwise the tree is asked directly.
            int globalNonMissingCount = predictorIsBoolean ?
                                        fisherCounts[0] + fisherCounts[1] + fisherCounts[2] + fisherCounts[3] :
                                        phyloTree.GlobalNonMissingCount(predictorDistributionClassFunction, targetDistributionClassFunction);

            // Leading columns of the line; the trailing "" argument is passed as one more value to CreateTabString.
            StringBuilder stringBuilder = new StringBuilder(
                SpecialFunctions.CreateTabString(this, rowIndex, workListCount, workIndex, nullIndex, predictorVariable,
                                                 predictorFalseNameCount,
                                                 predictorTrueNameCount,
                                                 predictorTrueNameCount + predictorFalseNameCount,
                                                 targetVariable,
                                                 targetFalseNameCount,
                                                 targetTrueNameCount,
                                                 targetTrueNameCount + targetFalseNameCount,
                                                 fisherCounts[0], fisherCounts[1], fisherCounts[2], fisherCounts[3],
                                                 globalNonMissingCount,
                                                 ""));

            // The row is ignored when any inspected count is zero. For a non-Boolean predictor the
            // placeholder { 1, 1 } is inspected instead, so only the target counts can trigger this.
            bool ignoreRow = false;

            foreach (int[] counts in new int[][] { predictorIsBoolean?predictorCounts : new int[] { 1, 1 }, targetCounts })
            {
                foreach (int count in counts)
                {
                    if (count == 0)
                    {
                        ignoreRow = true;
                    }
                }
            }

            if (ignoreRow)
            {
                // Presumably fills the remaining columns with NaN - see CompleteRowWithNaN.
                CompleteRowWithNaN(stringBuilder);
            }
            else
            {
                double targetMarginal    = (double)targetTrueNameCount / (double)(targetTrueNameCount + targetFalseNameCount);
                // For a non-Boolean predictor both predictor counts are zero, so this is 0.0 / 0.0 (NaN).
                double predictorMarginal = (double)predictorTrueNameCount / (double)(predictorTrueNameCount + predictorFalseNameCount);

                // ComputeLLR also receives the StringBuilder; whether it appends columns is not visible here.
                double diff = ComputeLLR(modelScorer, phyloTree, stringBuilder, targetMarginal, predictorMarginal, predictorDistributionClassFunction, targetDistributionClassFunction);


                // A negative difference is clamped to zero before the likelihood-ratio test.
                double pValue = SpecialFunctions.LogLikelihoodRatioTest(Math.Max(diff, 0), ChiSquareDegreesOfFreedom);

                stringBuilder.Append(SpecialFunctions.CreateTabString(diff, pValue));
            }

            return(stringBuilder.ToString());
        }
        /// <summary>
        /// Evaluates the model by cross-validation over the leaves for which both maps are non-missing.
        /// </summary>
        /// <param name="v1">First leaf-to-statistics map.</param>
        /// <param name="v2">Second leaf-to-statistics map.</param>
        /// <returns>
        /// The accumulated cross-validation results, or null when the fold loop never runs
        /// (that is, when _crossValidateCount is not positive).
        /// </returns>
        /// <remarks>
        /// The leaves are shuffled with a Random seeded from the XOR of hash codes of
        /// "case name + value1 + value2" over the usable leaves. For every fold the internal evaluator is
        /// trained on the leaves outside the fold and then evaluated on the full maps with the trained
        /// parameters. The loop stops early once the combined alternative log-likelihood is infinite.
        /// The fold size uses integer division, so leaves beyond foldSize * _crossValidateCount are
        /// never held out (assuming SubList takes a start index and a count - confirm).
        /// </remarks>
        public override EvaluationResults EvaluateModelOnData(Converter <Leaf, SufficientStatistics> v1, Converter <Leaf, SufficientStatistics> v2)
        {
            List <Leaf> usableLeaves = new List <Leaf>(100);
            int         shuffleSeed  = 0;

            foreach (Leaf leaf in ModelScorer.PhyloTree.LeafCollection)
            {
                SufficientStatistics firstValue  = v1(leaf);
                SufficientStatistics secondValue = v2(leaf);
                if (firstValue.IsMissing() || secondValue.IsMissing())
                {
                    continue;
                }

                usableLeaves.Add(leaf);
                shuffleSeed ^= (leaf.CaseName + firstValue.ToString() + secondValue.ToString()).GetHashCode();
            }

            Random rand = new Random(shuffleSeed);
            usableLeaves = SpecialFunctions.Shuffle(usableLeaves, ref rand);

            int foldSize = usableLeaves.Count / _crossValidateCount;

            EvaluationResultsCrossValidate accumulated = null;
            double testAltLLSum  = 0;   // for debugging
            double testNullLLSum = 0;   // for debugging

            for (int fold = 0; fold < _crossValidateCount; fold++)
            {
                // The fold [heldOutStart, heldOutEnd) is held out; everything else is training data.
                int heldOutStart = fold * foldSize;
                int heldOutEnd   = heldOutStart + foldSize;

                Set <Leaf> trainSet = new Set <Leaf>(SpecialFunctions.SubList(usableLeaves, heldOutEnd, usableLeaves.Count - heldOutEnd));
                trainSet.AddNewRange(SpecialFunctions.SubList(usableLeaves, 0, heldOutStart));

                Converter <Leaf, SufficientStatistics> v1Train = CreateFilteredMap(v1, trainSet);
                Converter <Leaf, SufficientStatistics> v2Train = CreateFilteredMap(v2, trainSet);

                EvaluationResults trainingResults    = InternalEvaluator.EvaluateModelOnData(v1Train, v2Train);
                EvaluationResults testAndTrainResult = InternalEvaluator.EvaluateModelOnDataGivenParams(v1, v2, trainingResults);
                EvaluationResultsTestGivenTrain foldResult = EvaluationResultsTestGivenTrain.GetInstance(this, trainingResults, testAndTrainResult);

                if (accumulated != null)
                {
                    accumulated = accumulated.AddNewResults(foldResult);
                }
                else
                {
                    accumulated = EvaluationResultsCrossValidate.GetInstance(this, foldResult);
                }

                // An infinite likelihood cannot be recovered by further folds.
                if (double.IsInfinity(accumulated.AltLL))
                {
                    break;
                }
#if DEBUG
                double            eps = 1E-10;
                EvaluationResults retestedTraining = InternalEvaluator.EvaluateModelOnDataGivenParams(v1Train, v2Train, trainingResults);
                Debug.Assert(ComplexNumber.ApproxEqual(retestedTraining.AltLL, trainingResults.AltLL, eps) &&
                             ComplexNumber.ApproxEqual(retestedTraining.NullLL, trainingResults.NullLL, eps));

                double foldNullLL = testAndTrainResult.NullLL - trainingResults.NullLL;
                double foldAltLL  = testAndTrainResult.AltLL - trainingResults.AltLL;

                Debug.Assert(ComplexNumber.ApproxEqual(foldNullLL, foldResult.NullLL, eps));
                Debug.Assert(ComplexNumber.ApproxEqual(foldAltLL, foldResult.AltLL, eps));

                testNullLLSum += foldNullLL;
                testAltLLSum  += foldAltLL;

                Debug.Assert(ComplexNumber.ApproxEqual(testNullLLSum, accumulated.NullLL, eps), "Combined result has wrong NullLL");
                Debug.Assert(ComplexNumber.ApproxEqual(testAltLLSum, accumulated.AltLL, eps), "Combined result has wrong AltLL");
#endif
            }

            return accumulated;
        }