//private bool AllVarianceZero(Dictionary<string, GaussianStatistics> caseNameToTarget)
/// <summary>
/// Reports whether the non-missing Gaussian statistics of the given leaves have zero variance.
/// </summary>
/// <param name="LeafCollection">Leaves to inspect.</param>
/// <param name="caseNameToTarget">Maps a leaf to its statistics; leaves whose statistics are missing are skipped.</param>
/// <returns>
/// The zero-variance flag of the first non-missing leaf, or false when no non-missing leaf exists.
/// </returns>
/// <remarks>
/// Two invariants are passed to SpecialFunctions.CheckCondition for every non-missing leaf:
/// the variance is zero exactly when the sample size is 1, and every leaf agrees with the
/// first one on whether its variance is zero. (How CheckCondition reports a violation is
/// not visible here; presumably it throws.)
/// </remarks>
private static bool AllVarianceZero(IEnumerable<Leaf> LeafCollection, Converter<Leaf, SufficientStatistics> caseNameToTarget)
{
    bool sawNonMissingLeaf = false;
    bool allZero = false; // stays false when the input has no non-missing leaf

    foreach (Leaf leaf in LeafCollection)
    {
        SufficientStatistics leafStats = caseNameToTarget(leaf);
        if (!leafStats.IsMissing())
        {
            GaussianStatistics gaussian = (GaussianStatistics)leafStats;

            SpecialFunctions.CheckCondition(
                (gaussian.Variance == 0) == (gaussian.SampleSize == 1),
                "Variance must be zero exactly when the sample size is 1");

            if (sawNonMissingLeaf)
            {
                // Every later leaf has to agree with the first one.
                SpecialFunctions.CheckCondition(
                    allZero == (gaussian.Variance == 0),
                    "If any variances are zero, then all must be zero");
            }
            else
            {
                // The first non-missing leaf decides the answer.
                sawNonMissingLeaf = true;
                allZero = (gaussian.Variance == 0);
            }
        }
    }

    return allZero;
}
/// <summary>
/// Lazily reads a sparse tab-delimited file with the columns var, cid and val, yielding one
/// (variable, caseId-to-value) pair per run of consecutive rows that share the same variable.
/// </summary>
/// <param name="sparseFileName">Path handed to SpecialFunctions.TabFileTable.</param>
/// <returns>
/// One pair per variable, in file order. A pair is yielded when the next variable starts or
/// when the input ends.
/// </returns>
/// <remarks>
/// Two conditions are passed to SpecialFunctions.CheckCondition: a variable may not start a
/// second group (the file must be grouped by variable), and a caseId may appear only once
/// within a variable.
/// </remarks>
public static IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> LoadSparseFileEnumeration(string sparseFileName) //where T1:ISufficientStatistics
{
    Set<string> startedVariables = new Set<string>();
    Pair<string, Dictionary<string, SufficientStatistics>> currentGroup = null;

    foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(sparseFileName, "var\tcid\tval", false))
    {
        string variable = row["var"];

        // A new group starts on the very first row and whenever the variable changes.
        if (currentGroup == null || currentGroup.First != variable)
        {
            if (currentGroup != null)
            {
                // The previous variable is complete: hand it to the consumer.
                yield return currentGroup;
            }

            SpecialFunctions.CheckCondition(
                !startedVariables.Contains(variable),
                string.Format("Input file ({0}) is not grouped by variable. Variable {1} appears in multiple places", sparseFileName, variable));
            startedVariables.AddNew(variable);

            currentGroup = new Pair<string, Dictionary<string, SufficientStatistics>>(
                variable,
                new Dictionary<string, SufficientStatistics>());
        }

        string caseId = row["cid"];
        SufficientStatistics val = SufficientStatistics.Parse(row["val"]);

        SpecialFunctions.CheckCondition(
            !currentGroup.Second.ContainsKey(caseId),
            string.Format("Input file ({0}) for var {1} contains multiple entries for caseId {2}", sparseFileName, variable, caseId));
        currentGroup.Second.Add(caseId, val);
    }

    // Flush the group that was still open when the input ended.
    if (currentGroup != null)
    {
        yield return currentGroup;
    }
}
/// <summary>
/// Tries to read Gaussian statistics written as "mean,variance,sampleSize".
/// </summary>
/// <param name="val">Text to parse. A null reference is treated as unparsable.</param>
/// <param name="result">
/// The parsed statistics on success; the value of GetMissingInstance() when
/// <paramref name="val"/> is "null" (any casing); otherwise null.
/// </param>
/// <returns>
/// True only when <paramref name="val"/> holds exactly three comma-separated fields that
/// parse as double, double and int.
/// </returns>
/// <remarks>
/// Numbers are parsed with the invariant culture. The field separator is a comma, so the
/// format cannot be read under a culture whose decimal separator is a comma; parsing with
/// the current culture made the outcome depend on the machine's regional settings.
/// </remarks>
internal static bool Parse(string val, out SufficientStatistics result)
{
    result = null;

    // The numeric sibling parsers return false for null input; do the same instead of
    // failing with a NullReferenceException.
    if (val == null)
    {
        return false;
    }

    if (val.Equals("null", StringComparison.InvariantCultureIgnoreCase))
    {
        // NOTE(review): result is set to the missing instance but false is returned, unlike
        // the boolean parser which reports "null" as a success. Kept as-is because callers
        // are not visible from here -- confirm this is intended.
        result = GetMissingInstance();
        return false;
    }

    string[] fields = val.Split(',');
    if (fields.Length != 3)
    {
        return false;
    }

    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    // Same number styles the culture-implicit TryParse overloads use by default.
    System.Globalization.NumberStyles doubleStyle =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    System.Globalization.NumberStyles intStyle = System.Globalization.NumberStyles.Integer;

    double mean;
    double variance;
    int sampleSize;
    if (double.TryParse(fields[0], doubleStyle, invariant, out mean) &&
        double.TryParse(fields[1], doubleStyle, invariant, out variance) &&
        int.TryParse(fields[2], intStyle, invariant, out sampleSize))
    {
        result = GaussianStatistics.GetInstance(mean, variance, sampleSize);
        return true;
    }

    return false;
}
/// <summary>
/// Recognizes the textual markers for a missing value: "null" or "missing", in any casing.
/// </summary>
/// <param name="val">Text to inspect. A null reference is treated as "not a missing marker".</param>
/// <param name="result">Singleton when a marker is recognized; otherwise null.</param>
/// <returns>True when <paramref name="result"/> was set to a non-null value.</returns>
/// <remarks>
/// The markers are fixed tokens, so they are compared ordinally. A culture-sensitive
/// comparison fails to match "MISSING" against "missing" under Turkish casing rules,
/// where 'I' and 'i' are different letters.
/// </remarks>
public static bool Parse(string val, out SufficientStatistics result)
{
    result = null;

    if (val != null &&
        (val.Equals("null", StringComparison.OrdinalIgnoreCase) ||
         val.Equals("missing", StringComparison.OrdinalIgnoreCase)))
    {
        result = Singleton;
    }

    return result != null;
}
/// <summary>
/// Builds a case-name-to-int dictionary from the non-missing statistics of the given leaves.
/// </summary>
/// <param name="leafToStatsMap">Maps a leaf to its statistics.</param>
/// <param name="fullLeafCollection">
/// Leaves to convert. It is enumerated twice: once by SpecialFunctions.Count to size the
/// dictionary and once to fill it.
/// </param>
/// <returns>
/// An entry per leaf whose statistics are not missing; the value is the statistics cast to
/// BooleanStatistics and then to int.
/// </returns>
private static Dictionary<string, int> SufficientStatisticsMapToIntDictionaryMap(Converter<Leaf, SufficientStatistics> leafToStatsMap, IEnumerable<Leaf> fullLeafCollection)
{
    int capacity = SpecialFunctions.Count(fullLeafCollection);
    Dictionary<string, int> caseNameToInt = new Dictionary<string, int>(capacity);

    foreach (Leaf leaf in fullLeafCollection)
    {
        SufficientStatistics stats = leafToStatsMap(leaf);
        if (stats.IsMissing())
        {
            continue; // missing leaves get no entry
        }

        string caseName = leaf.CaseName;
        int asInt = (int)(BooleanStatistics)stats;
        caseNameToInt.Add(caseName, asInt);
    }

    return caseNameToInt;
}
/// <summary>
/// Tries to read a continuous value from its textual form.
/// </summary>
/// <param name="val">Text to parse.</param>
/// <param name="result">
/// ContinuousStatistics.GetInstance of the parsed number on success; otherwise null.
/// </param>
/// <returns>True when <paramref name="val"/> parses as a double.</returns>
/// <remarks>
/// The number is parsed with the invariant culture. With the current culture, a data file
/// containing "1.5" is read as 15 on a machine whose culture treats '.' as a group
/// separator (e.g. de-DE), silently corrupting the value.
/// </remarks>
internal static bool Parse(string val, out SufficientStatistics result)
{
    double valAsDouble;

    // Float | AllowThousands is the style set double.TryParse(string, out double) uses by
    // default, so only the culture changes.
    bool parsed = double.TryParse(
        val,
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        System.Globalization.CultureInfo.InvariantCulture,
        out valAsDouble);

    if (parsed)
    {
        result = ContinuousStatistics.GetInstance(valAsDouble);
        return true;
    }

    result = null;
    return false;
}
/// <summary>
/// Tries to read a discrete (integer) value from its textual form.
/// </summary>
/// <param name="val">Text to parse.</param>
/// <param name="result">
/// DiscreteStatistics.GetInstance of the parsed integer on success; otherwise null.
/// </param>
/// <returns>True when <paramref name="val"/> parses as an int.</returns>
/// <remarks>
/// The integer is parsed with the invariant culture so that acceptance of a leading sign
/// does not depend on the current culture's sign symbols.
/// </remarks>
internal static bool Parse(string val, out SufficientStatistics result)
{
    int valAsInt;

    // NumberStyles.Integer is the style int.TryParse(string, out int) uses by default, so
    // only the culture changes.
    bool parsed = int.TryParse(
        val,
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out valAsInt);

    if (parsed)
    {
        result = DiscreteStatistics.GetInstance(valAsInt);
        return true;
    }

    result = null;
    return false;
}
/// <summary>
/// Reads a whole sparse tab-delimited file with the columns var, cid and val into a
/// variable-to-(caseId-to-value) dictionary.
/// </summary>
/// <param name="sparseFileName">Path handed to SpecialFunctions.TabFileTable.</param>
/// <returns>The nested dictionary holding every parsed row.</returns>
/// <remarks>
/// The per-variable dictionary is obtained through SpecialFunctions.GetValueOrDefault
/// (presumably it creates and registers an empty dictionary for a variable seen for the
/// first time -- confirm against that helper). A repeated caseId within a variable is
/// reported through SpecialFunctions.CheckCondition.
/// </remarks>
public static Dictionary<string, Dictionary<string, SufficientStatistics>> LoadSparseFileInMemory(string sparseFileName) //where TStat:ISufficientStatistics
{
    Dictionary<string, Dictionary<string, SufficientStatistics>> table =
        new Dictionary<string, Dictionary<string, SufficientStatistics>>();

    foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(sparseFileName, "var\tcid\tval", false))
    {
        string variable = row["var"];
        string caseId = row["cid"];
        SufficientStatistics val = SufficientStatistics.Parse(row["val"]);

        Dictionary<string, SufficientStatistics> valuesForVariable =
            SpecialFunctions.GetValueOrDefault(table, variable);

        bool caseIdIsNew = !valuesForVariable.ContainsKey(caseId);
        string duplicateMessage = string.Format(
            "Input file ({0}) for var {1} contains multiple entries for caseId {2}",
            sparseFileName, variable, caseId);
        SpecialFunctions.CheckCondition(caseIdIsNew, duplicateMessage);

        valuesForVariable.Add(caseId, val);
    }

    return table;
}
/// <summary>
/// Combines a predictor map and a target map into one map that classifies each leaf into a
/// DistributionDiscreteJointBinary.DistributionClass.
/// </summary>
/// <param name="predictorDistributionClassFunction">Maps a leaf to its predictor statistics.</param>
/// <param name="targetDistributionClassFunction">Maps a leaf to its target statistics.</param>
/// <returns>
/// A converter yielding the joint class as DiscreteStatistics: Missing when either input is
/// missing, otherwise FalseFalse, FalseTrue, TrueFalse or TrueTrue (predictor first). Any
/// value other than DistributionDiscreteBinary.DistributionClass.False counts as true.
/// </returns>
public override Converter<Leaf, SufficientStatistics> CreateAlternativeSufficientStatisticsMap(
    Converter<Leaf, SufficientStatistics> predictorDistributionClassFunction,
    Converter<Leaf, SufficientStatistics> targetDistributionClassFunction)
{
    return delegate(Leaf leaf)
    {
        // Both maps are always evaluated, predictor first.
        SufficientStatistics predStats = predictorDistributionClassFunction(leaf);
        SufficientStatistics targStats = targetDistributionClassFunction(leaf);

        DistributionDiscreteJointBinary.DistributionClass jointClass;
        if (predStats.IsMissing() || targStats.IsMissing())
        {
            jointClass = DistributionDiscreteJointBinary.DistributionClass.Missing;
        }
        else
        {
            DiscreteStatistics predClass = (DiscreteStatistics)predStats;
            DiscreteStatistics targetClass = (DiscreteStatistics)targStats;

            if (predClass == (int)DistributionDiscreteBinary.DistributionClass.False)
            {
                jointClass = (targetClass == (int)DistributionDiscreteBinary.DistributionClass.False)
                    ? DistributionDiscreteJointBinary.DistributionClass.FalseFalse
                    : DistributionDiscreteJointBinary.DistributionClass.FalseTrue;
            }
            else
            {
                jointClass = (targetClass == (int)DistributionDiscreteBinary.DistributionClass.False)
                    ? DistributionDiscreteJointBinary.DistributionClass.TrueFalse
                    : DistributionDiscreteJointBinary.DistributionClass.TrueTrue;
            }
        }

        return (DiscreteStatistics)(int)jointClass;
    };
}
//public static bool operator !=(BooleanStatistics stats1, BooleanStatistics stats2)
//{
//    return !(stats1 == stats2);
//}

//public static bool operator ==(BooleanStatistics boolStats1, BooleanStatistics boolStats2)
//{
//    DiscreteStatistics d1 = boolStats1 as DiscreteStatistics;
//    DiscreteStatistics d2 = boolStats2 as DiscreteStatistics;
//    return d1 == d2;
//}

/// <summary>
/// Tries to read a boolean statistic from its textual form.
/// </summary>
/// <param name="val">
/// Text to parse: "true" or "1", "false" or "0", "null" or "-1" (the words in any casing,
/// the digits exactly). A null reference is treated as unparsable.
/// </param>
/// <param name="result">
/// BooleanStatistics.GetInstance(true/false) for the true/false tokens,
/// BooleanStatistics.GetMissingInstance() for "null"/"-1"; otherwise null.
/// </param>
/// <returns>True when <paramref name="result"/> was set to a non-null value.</returns>
/// <remarks>
/// The tokens are fixed identifiers, so they are compared ordinally rather than with the
/// current culture. Null input returns false, matching the numeric parsers, instead of
/// failing with a NullReferenceException.
/// </remarks>
new internal static bool Parse(string val, out SufficientStatistics result)
{
    result = null;
    if (val == null)
    {
        return false;
    }

    if (val.Equals("true", StringComparison.OrdinalIgnoreCase) || val == "1")
    {
        result = BooleanStatistics.GetInstance(true);
    }
    else if (val.Equals("false", StringComparison.OrdinalIgnoreCase) || val == "0")
    {
        result = BooleanStatistics.GetInstance(false);
    }
    else if (val.Equals("null", StringComparison.OrdinalIgnoreCase) || val == "-1")
    {
        result = BooleanStatistics.GetMissingInstance();
    }

    return result != null;
}
/// <summary>
/// Combines a predictor map and a target map into one map that classifies each leaf into a
/// DistributionDiscreteJoint.DistributionClass.
/// </summary>
/// <param name="predictorMap">Maps a leaf to its predictor statistics.</param>
/// <param name="targetMap">Maps a leaf to its target statistics.</param>
/// <returns>
/// A converter yielding the joint class as DiscreteStatistics: Missing when either input is
/// missing, otherwise FalseFalse, FalseTrue, TrueFalse or TrueTrue (predictor first). Any
/// value other than DistributionDiscreteConditional.DistributionClass.False counts as true.
/// </returns>
public static Converter<Leaf, SufficientStatistics> CreateJointMap(Converter<Leaf, SufficientStatistics> predictorMap, Converter<Leaf, SufficientStatistics> targetMap)
{
    return delegate(Leaf leaf)
    {
        // Both maps are always evaluated, predictor first.
        SufficientStatistics predStats = predictorMap(leaf);
        SufficientStatistics targStats = targetMap(leaf);

        DistributionDiscreteJoint.DistributionClass jointClass;
        if (predStats.IsMissing() || targStats.IsMissing())
        {
            jointClass = DistributionDiscreteJoint.DistributionClass.Missing;
        }
        else
        {
            DiscreteStatistics predClass = (DiscreteStatistics)predStats;
            DiscreteStatistics targetClass = (DiscreteStatistics)targStats;

            if (predClass == (int)DistributionDiscreteConditional.DistributionClass.False)
            {
                jointClass = (targetClass == (int)DistributionDiscreteConditional.DistributionClass.False)
                    ? DistributionDiscreteJoint.DistributionClass.FalseFalse
                    : DistributionDiscreteJoint.DistributionClass.FalseTrue;
            }
            else
            {
                jointClass = (targetClass == (int)DistributionDiscreteConditional.DistributionClass.False)
                    ? DistributionDiscreteJoint.DistributionClass.TrueFalse
                    : DistributionDiscreteJoint.DistributionClass.TrueTrue;
            }
        }

        return (DiscreteStatistics)(int)jointClass;
    };
}
//public override Converter<Leaf, SufficientStatistics> CreateTargetSufficientStatisticsMap(Dictionary<string, ISufficientStatistics> caseIdToNonMissingValue)
//{
//    return ISufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
//}

/// <summary>
/// Builds a map that passes the target statistics through, masked by missing data.
/// </summary>
/// <param name="predictorDistributionClassFunction">Maps a leaf to its predictor statistics.</param>
/// <param name="targetDistributionClassFunction">Maps a leaf to its target statistics.</param>
/// <returns>
/// A converter returning the leaf's target statistics, or
/// GaussianStatistics.GetMissingInstance() when the predictor or the target is missing.
/// </returns>
public override Converter<Leaf, SufficientStatistics> CreateAlternativeSufficientStatisticsMap(
    Converter<Leaf, SufficientStatistics> predictorDistributionClassFunction,
    Converter<Leaf, SufficientStatistics> targetDistributionClassFunction)
{
    return delegate(Leaf leaf)
    {
        // Both maps are always evaluated, predictor first.
        SufficientStatistics predStats = predictorDistributionClassFunction(leaf);
        SufficientStatistics targetStats = targetDistributionClassFunction(leaf);

        SufficientStatistics outcome = targetStats;
        if (predStats.IsMissing() || targetStats.IsMissing())
        {
            // Missing data on either side makes the leaf unusable.
            outcome = GaussianStatistics.GetMissingInstance();
        }

        return outcome;
    };
}
/// <summary>
/// Wraps a caseId-to-value dictionary as a leaf-to-statistics map by delegating to
/// SufficientStatistics.DictionaryToLeafMap.
/// </summary>
/// <param name="caseIdToNonMissingValue">Statistics keyed by case id.</param>
/// <returns>The converter produced by SufficientStatistics.DictionaryToLeafMap.</returns>
public Converter<Leaf, SufficientStatistics> CreateSufficientStatisticsMap(Dictionary<string, SufficientStatistics> caseIdToNonMissingValue)
{
    Converter<Leaf, SufficientStatistics> leafMap = SufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
    return leafMap;
}
//protected override NullDataCollection CreateNullDataGenerator(ModelScorer modelScorer, PhyloTree phyloTree, RangeCollection nullIndexRangeCollection, Dictionary<string, Dictionary<string, BooleanStatistics>> predictorVariableToCaseIdToRealNonMissingValue)
//{
//    if (DateTime.Now.Date == new DateTime(2006, 6, 28).Date) // for testing, force it to use the parametric bootstrap
//    {
//        return NullDataCollection.GetInstance(
//            new NullDataGeneratorAlongTree(modelScorer, phyloTree, (ModelTesterDiscrete)this),
//            nullIndexRangeCollection,
//            predictorVariableToCaseIdToRealNonMissingValue);
//    }
//    return base.CreateNullDataGenerator(modelScorer, phyloTree, nullIndexRangeCollection, predictorVariableToCaseIdToRealNonMissingValue);
//}

//public override Converter<Leaf, SufficientStatistics> CreateTargetSufficientStatisticsMap(Dictionary<string, ISufficientStatistics> caseIdToNonMissingValue)
//{
//    return ISufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
//}

//public override Converter<Leaf, SufficientStatistics> CreatePredictorSufficientStatisticsMap(Dictionary<string, BooleanStatistics> caseIdToNonMissingValue)
//{
//    return CreateTargetSufficientStatisticsMap(caseIdToNonMissingValue);
//}

/// <summary>
/// Builds one tab-delimited report line for a predictor/target pair.
/// </summary>
/// <param name="modelScorer">Forwarded to ComputeLLR.</param>
/// <param name="phyloTree">Tree used for the leaf counts, Fisher counts and non-missing count.</param>
/// <param name="rowAndTargetData">Supplies the row fields plus the predictor and target data.</param>
/// <param name="workList">Not used by this implementation.</param>
/// <param name="rowIndex">Written to the report line.</param>
/// <param name="workListCount">Written to the report line.</param>
/// <param name="workIndex">Written to the report line.</param>
/// <returns>
/// The header fields followed either by the output of CompleteRowWithNaN (when any
/// relevant count is zero) or by whatever ComputeLLR appends plus the LLR difference and
/// its p-value.
/// </returns>
protected override string CreateReportLine(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    RowData rowAndTargetData,
    UniversalWorkList workList,
    int rowIndex,
    int workListCount,
    int workIndex)
{
    //!!!there is very similar code in ModelTesterGaussian.cs

    // We're iterating over each predictor (e.g. hla), each target (e.g. position in the
    // sequence), and each possible substring at that position. Then we ask the question:
    // does the presence of predictor (e.g. hla) influence the probability that target
    // (e.g. mer in position n1pos) will show up?
    // nullIndex specifies whether this is the true data or randomized data.
    Dictionary<string, string> row = rowAndTargetData.Row;
    string predictorVariable = row[PhyloTree.PredictorVariableColumnName]; // e.g. hla
    string targetVariable = row[PhyloTree.TargetVariableColumnName];       // e.g. A@182 (amino acid "A" at position 182)
    int nullIndex = int.Parse(row[PhyloTree.NullIndexColumnName]);

    //Dictionary<string, bool> caseIdToNonMissingPredictorValue = workList.NullIndexToPredictorToCaseIdToNonMissingValue[nullIndex][predictorVariable];
    Dictionary<string, SufficientStatistics> predictorData = rowAndTargetData.PredictorData; //workList.GetCaseIdToNonMissingValueForNullIndexAndPredictorVariable(nullIndex, predictorVariable);
    Dictionary<string, SufficientStatistics> targetData = rowAndTargetData.TargetData;

    // The first predictor value (if any) decides whether the predictor is boolean.
    SufficientStatistics representative = null;
    foreach (SufficientStatistics candidate in predictorData.Values)
    {
        representative = candidate;
        break;
    }
    bool predictorIsBoolean = representative is BooleanStatistics;

    Converter<Leaf, SufficientStatistics> targetDistributionClassFunction = CreateSufficientStatisticsMap(targetData);
    Converter<Leaf, SufficientStatistics> predictorDistributionClassFunction = CreateSufficientStatisticsMap(predictorData);

    // A non-boolean predictor has no class counts; zero-filled placeholders are reported.
    int[] predictorCounts;
    if (predictorIsBoolean)
    {
        predictorCounts = phyloTree.CountsOfLeaves(predictorDistributionClassFunction, NullModelDistribution);
    }
    else
    {
        predictorCounts = new int[2];
    }
    int[] targetCounts = phyloTree.CountsOfLeaves(targetDistributionClassFunction, NullModelDistribution);

    int predictorFalseNameCount = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
    int predictorTrueNameCount = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.True];
    int targetFalseNameCount = targetCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
    int targetTrueNameCount = targetCounts[(int)DistributionDiscreteBinary.DistributionClass.True];

    int[] fisherCounts;
    int globalNonMissingCount;
    if (predictorIsBoolean)
    {
        fisherCounts = phyloTree.FisherCounts(predictorDistributionClassFunction, targetDistributionClassFunction);
        globalNonMissingCount = fisherCounts[0] + fisherCounts[1] + fisherCounts[2] + fisherCounts[3];
    }
    else
    {
        fisherCounts = new int[4];
        globalNonMissingCount = phyloTree.GlobalNonMissingCount(predictorDistributionClassFunction, targetDistributionClassFunction);
    }

    string header = SpecialFunctions.CreateTabString(
        this, rowIndex, workListCount, workIndex, nullIndex,
        predictorVariable, predictorFalseNameCount, predictorTrueNameCount, predictorTrueNameCount + predictorFalseNameCount,
        targetVariable, targetFalseNameCount, targetTrueNameCount, targetTrueNameCount + targetFalseNameCount,
        fisherCounts[0], fisherCounts[1], fisherCounts[2], fisherCounts[3],
        globalNonMissingCount,
        "");
    StringBuilder stringBuilder = new StringBuilder(header);

    // The row is skipped when any target count is zero, or -- for boolean predictors only --
    // any predictor count is zero.
    bool ignoreRow = Array.IndexOf(targetCounts, 0) >= 0;
    if (predictorIsBoolean && Array.IndexOf(predictorCounts, 0) >= 0)
    {
        ignoreRow = true;
    }

    if (ignoreRow)
    {
        CompleteRowWithNaN(stringBuilder);
        return stringBuilder.ToString();
    }

    double targetMarginal = (double)targetTrueNameCount / (double)(targetTrueNameCount + targetFalseNameCount);
    double predictorMarginal = (double)predictorTrueNameCount / (double)(predictorTrueNameCount + predictorFalseNameCount);

    double diff = ComputeLLR(modelScorer, phyloTree, stringBuilder, targetMarginal, predictorMarginal, predictorDistributionClassFunction, targetDistributionClassFunction);
    // A negative difference is clamped to zero before the likelihood-ratio test.
    double pValue = SpecialFunctions.LogLikelihoodRatioTest(Math.Max(diff, 0), ChiSquareDegreesOfFreedom);

    stringBuilder.Append(SpecialFunctions.CreateTabString(diff, pValue));
    return stringBuilder.ToString();
}
/// <summary>
/// Evaluates the model by cross-validation over the leaves for which both maps are non-missing.
/// </summary>
/// <param name="v1">First leaf-to-statistics map.</param>
/// <param name="v2">Second leaf-to-statistics map.</param>
/// <returns>
/// The results combined over the folds, or null when the fold loop never runs. The loop
/// stops early once the combined AltLL is infinite.
/// </returns>
/// <remarks>
/// The usable leaves are shuffled with a Random seeded from the XOR of the hash codes of
/// each leaf's case name concatenated with both statistics' ToString(). The fold size is
/// Count / _crossValidateCount (integer division). Fold i leaves the shuffled leaves at
/// indices [i * size, (i + 1) * size) out of the training set, so leaves past
/// _crossValidateCount * size are part of every training set.
/// </remarks>
public override EvaluationResults EvaluateModelOnData(Converter<Leaf, SufficientStatistics> v1, Converter<Leaf, SufficientStatistics> v2)
{
    List<Leaf> usableLeaves = new List<Leaf>(100);
    int seed = 0;
    foreach (Leaf leaf in ModelScorer.PhyloTree.LeafCollection)
    {
        SufficientStatistics class1 = v1(leaf);
        SufficientStatistics class2 = v2(leaf);
        if (class1.IsMissing() || class2.IsMissing())
        {
            continue; // only leaves with both values take part
        }

        usableLeaves.Add(leaf);
        string seedText = leaf.CaseName + class1.ToString() + class2.ToString();
        seed = seed ^ seedText.GetHashCode();
    }

    Random rand = new Random(seed);
    usableLeaves = SpecialFunctions.Shuffle(usableLeaves, ref rand);

    int groupSize = usableLeaves.Count / _crossValidateCount;
    EvaluationResultsCrossValidate combinedResults = null;
    double testAltLLSum = 0;    // for debugging
    double testNullLLSum = 0;   // for debugging

    for (int fold = 0; fold < _crossValidateCount; fold++)
    {
        int testStart = fold * groupSize;
        int trainStart = testStart + groupSize;

        // Training set = everything after the held-out slice plus everything before it.
        Set<Leaf> trainSet = new Set<Leaf>(SpecialFunctions.SubList(usableLeaves, trainStart, usableLeaves.Count - trainStart));
        trainSet.AddNewRange(SpecialFunctions.SubList(usableLeaves, 0, testStart));

        Converter<Leaf, SufficientStatistics> v1Train = CreateFilteredMap(v1, trainSet);
        Converter<Leaf, SufficientStatistics> v2Train = CreateFilteredMap(v2, trainSet);

        EvaluationResults trainingResults = InternalEvaluator.EvaluateModelOnData(v1Train, v2Train);
        EvaluationResults testAndTrainResult = InternalEvaluator.EvaluateModelOnDataGivenParams(v1, v2, trainingResults);
        EvaluationResultsTestGivenTrain testGivenTrainResult = EvaluationResultsTestGivenTrain.GetInstance(this, trainingResults, testAndTrainResult);

        if (combinedResults != null)
        {
            combinedResults = combinedResults.AddNewResults(testGivenTrainResult);
        }
        else
        {
            combinedResults = EvaluationResultsCrossValidate.GetInstance(this, testGivenTrainResult);
        }

        if (double.IsInfinity(combinedResults.AltLL))
        {
            // no point in continuing...infinity will kill everything.
            break;
        }

#if DEBUG
        double eps = 1E-10;
        EvaluationResults testTrainingResults = InternalEvaluator.EvaluateModelOnDataGivenParams(v1Train, v2Train, trainingResults);
        Debug.Assert(ComplexNumber.ApproxEqual(testTrainingResults.AltLL, trainingResults.AltLL, eps) && ComplexNumber.ApproxEqual(testTrainingResults.NullLL, trainingResults.NullLL, eps));
        //Debug.Assert(testTrainingResults.Equals(trainingResults));

        double newNullLL = testAndTrainResult.NullLL - trainingResults.NullLL;
        double newAltLL = testAndTrainResult.AltLL - trainingResults.AltLL;

        Debug.Assert(ComplexNumber.ApproxEqual(newNullLL, testGivenTrainResult.NullLL, eps));
        Debug.Assert(ComplexNumber.ApproxEqual(newAltLL, testGivenTrainResult.AltLL, eps));

        testNullLLSum += newNullLL;
        testAltLLSum += newAltLL;

        Debug.Assert(ComplexNumber.ApproxEqual(testNullLLSum, combinedResults.NullLL, eps), "Combined result has wrong NullLL");
        Debug.Assert(ComplexNumber.ApproxEqual(testAltLLSum, combinedResults.AltLL, eps), "Combined result has wrong AltLL");
#endif
    }

    return combinedResults;
}