/// <summary>
/// Builds the report line for one work item from a two-by-two table over the
/// row's predictor and target values and that table's Fisher exact test value.
/// </summary>
/// <param name="modelScorer">Not read by this implementation.</param>
/// <param name="phyloTree">Not read by this implementation.</param>
/// <param name="rowAndTargetData">Supplies the row's column values plus its predictor and target data.</param>
/// <param name="workList">Not read by this implementation.</param>
/// <param name="rowIndex">Written into the report line as-is.</param>
/// <param name="workListCount">Written into the report line as-is.</param>
/// <param name="workIndex">Written into the report line as-is.</param>
/// <returns>The string produced by <c>SpecialFunctions.CreateTabString</c> for this row.</returns>
protected override string CreateReportLine(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    RowData rowAndTargetData,
    UniversalWorkList workList,
    int rowIndex,
    int workListCount,
    int workIndex)
{
    Dictionary<string, string> row = rowAndTargetData.Row;
    string predictorVariable = row[PhyloTree.PredictorVariableColumnName]; // e.g. hla
    string targetVariable = row[PhyloTree.TargetVariableColumnName];       // e.g. A@182 (amino acid "A" at position 182)
    int nullIndex = int.Parse(row[PhyloTree.NullIndexColumnName]);

    Dictionary<string, SufficientStatistics> caseIdToNonMissingPredictorValue = rowAndTargetData.PredictorData;
    Dictionary<string, SufficientStatistics> caseIdToNonMissingTargetValue = rowAndTargetData.TargetData;

    TwoByTwo fishers2by2 = TwoByTwo.GetInstance(
        SufficientStatisticsMapToIntMap(caseIdToNonMissingPredictorValue),
        SufficientStatisticsMapToIntMap(caseIdToNonMissingTargetValue));

    // Read the test value once and reuse it; previously it was fetched into an
    // unused local and then fetched again for the report line.
    double pValue = fishers2by2.FisherExactTest;

    return SpecialFunctions.CreateTabString(
        this,
        rowIndex,
        workListCount,
        workIndex,
        nullIndex,
        predictorVariable,
        targetVariable,
        fishers2by2.CountsString(),
        pValue);
}
/// <summary>
/// Factory for work lists. Returns a <c>UniversalWorkListPredTargPairs</c> when
/// <paramref name="keepTest"/> is a <c>KeepPredictorTargetPairs</c>, or is a
/// <c>KeepCollection</c> containing one; otherwise returns a plain
/// <c>UniversalWorkList</c>. All arguments are forwarded unchanged.
/// </summary>
public static UniversalWorkList GetInstance(
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration,
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration,
    NullDataCollection nullDataCollection,
    RangeCollection nullIndexRange,
    KeepTest<Dictionary<string, string>> keepTest)
{
    bool usePairEnumeration = keepTest is KeepPredictorTargetPairs;

    // A collection of keep tests asks for pair enumeration if any member does.
    KeepCollection<Dictionary<string, string>> keepCollection =
        keepTest as KeepCollection<Dictionary<string, string>>;
    if (keepCollection != null)
    {
        foreach (KeepTest<Dictionary<string, string>> member in keepCollection.KeepTestCollection)
        {
            usePairEnumeration = usePairEnumeration || (member is KeepPredictorTargetPairs);
        }
    }

    if (!usePairEnumeration)
    {
        return new UniversalWorkList(
            predictorNameAndCaseIdToNonMissingValueEnumeration,
            targetNameAndCaseIdToNonMissingValueEnumeration,
            nullDataCollection,
            nullIndexRange,
            keepTest);
    }

    return UniversalWorkListPredTargPairs.GetInstance(
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration,
        nullDataCollection,
        nullIndexRange,
        keepTest);
}
/// <summary>
/// Evaluates <paramref name="modelEvaluator"/> on the row's predictor and target
/// data and returns the report line for that row.
/// </summary>
/// <param name="modelEvaluator">Evaluator whose <c>EvaluateModelOnData</c> produces the results.</param>
/// <param name="rowAndTargetData">Supplies the row's column values plus its predictor and target data.</param>
/// <param name="workList">Not read by this method.</param>
/// <param name="rowIndex">Written into the report line as-is.</param>
/// <param name="workListCount">Written into the report line as-is.</param>
/// <param name="workIndex">Written into the report line as-is.</param>
/// <returns>The string produced by <c>SpecialFunctions.CreateTabString</c> for this row.</returns>
private string CreateReportLine(
    ModelEvaluator modelEvaluator,
    RowData rowAndTargetData,
    UniversalWorkList workList,
    int rowIndex,
    int workListCount,
    int workIndex)
{
    Dictionary<string, string> fields = rowAndTargetData.Row;
    string predictorName = fields[Tabulate.PredictorVariableColumnName];
    string targetName = fields[Tabulate.TargetVariableColumnName];
    int nullIndex = int.Parse(fields[Tabulate.NullIndexColumnName]);

    Dictionary<string, SufficientStatistics> predictorByCaseId = rowAndTargetData.PredictorData;
    Dictionary<string, SufficientStatistics> targetByCaseId = rowAndTargetData.TargetData;

    // Leaf -> statistics lookups in the form the evaluator consumes.
    Converter<Leaf, SufficientStatistics> predictorMap = CreateSufficientStatisticsMap(predictorByCaseId);
    Converter<Leaf, SufficientStatistics> targetMap = CreateSufficientStatisticsMap(targetByCaseId);

    EvaluationResults results = modelEvaluator.EvaluateModelOnData(predictorMap, targetMap);

    return SpecialFunctions.CreateTabString(
        results.ModelEvaluator.Name,
        rowIndex,
        workListCount,
        workIndex,
        nullIndex,
        predictorName,
        targetName,
        results.ToString());
}
/// <summary>
/// Builds the work list from the predictor and target sparse files, then writes a
/// header plus one report line per selected work item to a text file inside
/// <paramref name="outputDirectoryName"/>. Elapsed time is printed to the console.
/// </summary>
/// <param name="modelScorer">Passed to the null-data generator and to <c>CreateReportLine</c>; its cache is cleared if a row runs out of memory.</param>
/// <param name="phyloTree">Passed to the null-data generator and to <c>CreateReportLine</c>.</param>
/// <param name="predictorSparseFileName">Source file for the predictor enumeration.</param>
/// <param name="targetSparseFileName">Source file for the target enumeration.</param>
/// <param name="leafDistributionName">Used only as a component of the output file name.</param>
/// <param name="nullDataGeneratorName">Selects the null-data generator; also a component of the output file name.</param>
/// <param name="keepTest">Forwarded to <c>UniversalWorkList.GetInstance</c>.</param>
/// <param name="skipRowIndexRangeCollectionOrNull">Row indexes to skip, or null to skip none.</param>
/// <param name="shortName">First component of the output file name.</param>
/// <param name="outputDirectoryName">Directory for the output file; created if needed.</param>
/// <param name="pieceIndexRangeCollection">Only rows whose work index is contained here are reported.</param>
/// <param name="pieceCount">Passed to <c>ExtractWorkIndex</c>; also a component of the output file name.</param>
/// <param name="nullIndexRangeCollection">Forwarded to the null-data generator and work list; also a component of the output file name.</param>
/// <param name="optimizerName">Not read by this method.</param>
public void Run(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    string predictorSparseFileName,
    string targetSparseFileName,
    string leafDistributionName,
    string nullDataGeneratorName,
    KeepTest<Dictionary<string, string>> keepTest,
    RangeCollection skipRowIndexRangeCollectionOrNull,
    string shortName,
    string outputDirectoryName,
    RangeCollection pieceIndexRangeCollection,
    int pieceCount,
    RangeCollection nullIndexRangeCollection,
    string optimizerName)
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    Directory.CreateDirectory(outputDirectoryName);

    // Path.Combine supplies the platform's directory separator (the name used to
    // be glued on with a literal '\') and does not double a trailing separator.
    string outputFileName = Path.Combine(
        outputDirectoryName,
        string.Format("{0}.{1}.{2}.{3}.{4}.{5}{6}.txt",
            shortName,
            leafDistributionName,
            nullDataGeneratorName,
            nullIndexRangeCollection,
            pieceCount,
            pieceIndexRangeCollection,
            skipRowIndexRangeCollectionOrNull == null
                ? ""
                : ".Skip" + skipRowIndexRangeCollectionOrNull.Count().ToString()));

    bool speedOverMemory = true;
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration =
        CreateNameAndCaseIdToNonMissingValueEnumeration(predictorSparseFileName, speedOverMemory);
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration =
        CreateNameAndCaseIdToNonMissingValueEnumeration(targetSparseFileName, speedOverMemory);

    NullDataCollection nullDataGenerator = CreateNullDataGenerator(
        nullDataGeneratorName,
        modelScorer,
        phyloTree,
        nullIndexRangeCollection,
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration);

    UniversalWorkList workList = UniversalWorkList.GetInstance(
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration,
        nullDataGenerator,
        nullIndexRangeCollection,
        keepTest);

    // First pass over the work list: total row count.
    int workListCount = SpecialFunctions.Count(workList.List());

    // Rows left after removing the skipped indexes.
    int effectiveWorkListCount;
    if (skipRowIndexRangeCollectionOrNull == null)
    {
        effectiveWorkListCount = workListCount;
    }
    else
    {
        effectiveWorkListCount = 0;
        for (int iRowIndex = 0; iRowIndex < workListCount; iRowIndex++)
        {
            if (!skipRowIndexRangeCollectionOrNull.Contains(iRowIndex))
            {
                effectiveWorkListCount++;
            }
        }
    }
    Console.WriteLine("{0} Total rows. Skipping {1} of them.", workListCount, workListCount - effectiveWorkListCount);

    using (TextWriter textWriter = File.CreateText(outputFileName))
    {
        textWriter.WriteLine(Header);

        int rowIndex = -1;          // index over every row in the work list
        int effectiveRowIndex = -1; // index over the non-skipped rows only
        foreach (RowData rowAndTargetData in workList.List())
        {
            //!!!make all these parameters and the calculation a class
            ++rowIndex;
            Debug.Assert(rowIndex < workListCount); // real assert

            if (skipRowIndexRangeCollectionOrNull != null && skipRowIndexRangeCollectionOrNull.Contains(rowIndex))
            {
                continue;
            }

            ++effectiveRowIndex;
            int workIndex = ExtractWorkIndex(effectiveRowIndex, pieceCount, effectiveWorkListCount);
            if (!pieceIndexRangeCollection.Contains(workIndex))
            {
                continue;
            }

            Debug.WriteLine("WorkItemIndex " + rowIndex.ToString());

            string reportLine;
            try
            {
                reportLine = CreateReportLine(modelScorer, phyloTree, rowAndTargetData, workList, rowIndex, workListCount, workIndex);
            }
            catch (OutOfMemoryException)
            {
                // Best-effort recovery: drop the scorer's cache and retry this row once.
                Console.WriteLine("OUT OF MEMORY!! Clearing cache and trying to recover where we left off.");
                modelScorer.ClearCache();
                reportLine = CreateReportLine(modelScorer, phyloTree, rowAndTargetData, workList, rowIndex, workListCount, workIndex);
            }

            textWriter.WriteLine(reportLine);
            // Flush per row so completed rows are on disk if a later row fails.
            textWriter.Flush();
        }
    }

    stopwatch.Stop();
    Console.WriteLine("Running time: " + stopwatch.Elapsed);
}
/// <summary>
/// For each work-list row whose predictor and target names equal
/// <paramref name="predictorVariableName"/> and <paramref name="targetVariableName"/>,
/// computes the log likelihood of the null model (for the target and for the
/// predictor) and of the alternative model at the supplied parameter values, and
/// prints the resulting scores to the console.
/// </summary>
/// <param name="modelScorer">Creates the message initializers and computes the log likelihoods.</param>
/// <param name="phyloTree">Forwarded to the null-data generator.</param>
/// <param name="predictorSparseFileName">Source file for the predictor enumeration.</param>
/// <param name="targetSparseFileName">Source file for the target enumeration.</param>
/// <param name="predictorVariableName">Predictor name a row must carry to be scored.</param>
/// <param name="targetVariableName">Target name a row must carry to be scored.</param>
/// <param name="nullModelArgs">Converted to parameters by <c>NullModelDistribution.GetParameters</c>.</param>
/// <param name="altModelArgs">Converted to parameters by <c>AltModelDistribution.GetParameters</c>.</param>
public void ScoreTree(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    string predictorSparseFileName,
    string targetSparseFileName,
    string predictorVariableName,
    string targetVariableName,
    double[] nullModelArgs,
    double[] altModelArgs)
{
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration =
        LoadSparseFileEnumeration(predictorSparseFileName);
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration =
        LoadSparseFileEnumeration(targetSparseFileName);

    // The only null index requested is -1.
    RangeCollection nullIndexRangeCollection = RangeCollection.GetInstance(-1, -1);
    NullDataCollection nullDataGenerator = CreateNullDataGenerator(
        "PredictorPermutation",
        modelScorer,
        phyloTree,
        nullIndexRangeCollection,
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration);

    UniversalWorkList workList = UniversalWorkList.GetInstance(
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration,
        nullDataGenerator,
        nullIndexRangeCollection,
        AlwaysKeep<Dictionary<string, string>>.GetInstance());

    foreach (RowData rowAndTargetData in workList.List())
    {
        if (rowAndTargetData.Row[PhyloTree.PredictorVariableColumnName] != predictorVariableName
            || rowAndTargetData.Row[PhyloTree.TargetVariableColumnName] != targetVariableName)
        {
            continue;
        }

        Dictionary<string, SufficientStatistics> caseIdToNonNullPredictorValue = rowAndTargetData.PredictorData;
        Dictionary<string, SufficientStatistics> caseIdToNonMissingTargetValue = rowAndTargetData.TargetData;

        Converter<Leaf, SufficientStatistics> targetDistributionMap =
            CreateSufficientStatisticsMap(caseIdToNonMissingTargetValue);
        Converter<Leaf, SufficientStatistics> predictorDistributionClassFunction =
            CreateSufficientStatisticsMap(caseIdToNonNullPredictorValue);
        Converter<Leaf, SufficientStatistics> altDistributionMap =
            CreateAlternativeSufficientStatisticsMap(predictorDistributionClassFunction, targetDistributionMap);

        OptimizationParameterList nullParams = NullModelDistribution.GetParameters(nullModelArgs);
        OptimizationParameterList altParams = AltModelDistribution.GetParameters(altModelArgs);

        MessageInitializer messageInitializer;
        double logLikelihood;

        Console.WriteLine(SpecialFunctions.CreateTabString("Variable", nullParams.ToStringHeader(), "LogL"));

        // Null model, target.
        messageInitializer = modelScorer.CreateMessageInitializer(predictorDistributionClassFunction, targetDistributionMap, NullModelDistribution);
        logLikelihood = modelScorer.ComputeLogLikelihoodModelGivenData(messageInitializer, nullParams);
        Score scoreIndTarget = Score.GetInstance(logLikelihood, nullParams);
        Console.WriteLine("Target\t" + scoreIndTarget);

        // Null model with the two maps swapped, i.e. the predictor.
        // The likelihood is computed once; a second call whose result was
        // discarded used to follow this one.
        messageInitializer = modelScorer.CreateMessageInitializer(targetDistributionMap, predictorDistributionClassFunction, NullModelDistribution);
        logLikelihood = modelScorer.ComputeLogLikelihoodModelGivenData(messageInitializer, nullParams);
        Score scoreIndPredictor = Score.GetInstance(logLikelihood, nullParams);
        Console.WriteLine("Predictor\t" + scoreIndPredictor);

        Console.WriteLine("\n" + SpecialFunctions.CreateTabString("Variable", altParams.ToStringHeader(), "LogL"));

        // Alternative model over the combined map.
        messageInitializer = modelScorer.CreateMessageInitializer(null, altDistributionMap, AltModelDistribution);
        logLikelihood = modelScorer.ComputeLogLikelihoodModelGivenData(messageInitializer, altParams);
        Score scoreAlt = Score.GetInstance(logLikelihood, altParams);
        Console.WriteLine(SpecialFunctions.CreateTabString(AltModelDistribution, scoreAlt));
    }
}
/// <summary>
/// Produces the report line for one work item. Derived classes supply the
/// implementation.
/// </summary>
/// <param name="modelScorer">Scorer made available to the implementation.</param>
/// <param name="phyloTree">Tree made available to the implementation.</param>
/// <param name="rowAndTargetData">The work item's row values together with its predictor and target data.</param>
/// <param name="workList">The work list the row came from.</param>
/// <param name="rowIndex">Index of the row (presumably its position in the full work list; confirm against the caller).</param>
/// <param name="workListCount">Number of rows (presumably in the full work list; confirm against the caller).</param>
/// <param name="workIndex">Work index assigned to the row (presumably identifies the piece the row belongs to; confirm against the caller).</param>
/// <returns>The text of the report line.</returns>
protected abstract string CreateReportLine( ModelScorer modelScorer, PhyloTree phyloTree, RowData rowAndTargetData, UniversalWorkList workList, int rowIndex, int workListCount, int workIndex);
/// <summary>
/// Builds the report line for one work item: identifying columns and leaf counts,
/// followed either by the NaN filler (when any predictor class has no leaves) or by
/// the scores of the two model fits, their log-likelihood difference and the
/// likelihood-ratio test value.
/// </summary>
/// <param name="modelScorer">Creates the message initializer and scores the models.</param>
/// <param name="phyloTree">Supplies the leaf counts.</param>
/// <param name="rowAndTargetData">Supplies the row's column values plus its predictor and target data.</param>
/// <param name="workList">Not read by this implementation.</param>
/// <param name="rowIndex">Written into the report line as-is.</param>
/// <param name="workListCount">Written into the report line as-is.</param>
/// <param name="workIndex">Written into the report line as-is.</param>
/// <returns>The completed report line.</returns>
protected override string CreateReportLine(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    RowData rowAndTargetData,
    UniversalWorkList workList,
    int rowIndex,
    int workListCount,
    int workIndex)
{
    //!!!there is very similar code in ModelTesterDiscrete.cs
    Dictionary<string, string> fields = rowAndTargetData.Row;
    string predictorName = fields[PhyloTree.PredictorVariableColumnName];
    string targetName = fields[PhyloTree.TargetVariableColumnName]; // e.g. A@182 (amino acid "A" at position 182)
    int nullIndex = int.Parse(fields[PhyloTree.NullIndexColumnName]);

    Dictionary<string, SufficientStatistics> predictorByCaseId = rowAndTargetData.PredictorData;
    Dictionary<string, SufficientStatistics> targetByCaseId = rowAndTargetData.TargetData;

    Converter<Leaf, SufficientStatistics> targetMap = CreateSufficientStatisticsMap(targetByCaseId);
    Converter<Leaf, SufficientStatistics> predictorMap = CreateSufficientStatisticsMap(predictorByCaseId);

    int[] predictorCounts = phyloTree.CountsOfLeaves(predictorMap);
    int falseCount = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
    int trueCount = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.True];
    int targetNonMissingCount = phyloTree.CountOfNonMissingLeaves(targetByCaseId);
    int globalNonMissingCount = phyloTree.GlobalNonMissingCount(predictorMap, targetMap);

    StringBuilder line = new StringBuilder(
        SpecialFunctions.CreateTabString(
            this,
            rowIndex,
            workListCount,
            workIndex,
            nullIndex,
            predictorName,
            falseCount,
            trueCount,
            trueCount + falseCount,
            targetName,
            targetNonMissingCount,
            globalNonMissingCount,
            ""));

    // A predictor class with zero leaves means the row is not scored.
    if (Array.IndexOf(predictorCounts, 0) >= 0)
    {
        CompleteRowWithNaN(line);
        return line.ToString();
    }

    MessageInitializer messageInitializer =
        modelScorer.CreateMessageInitializer(predictorMap, targetMap, NullModelDistribution);
    NullModelDistribution.InitialParamVals = null;

    // Score without the parameter first, then with it; each fit's parameters
    // become the alternative model's initial values.
    bool[] useParameterOptions = new bool[] { false, true };
    double[] logLikelihoods = new double[useParameterOptions.Length];
    for (int i = 0; i < useParameterOptions.Length; i++)
    {
        Score score = modelScorer.ScoreModel(messageInitializer, useParameterOptions[i]);
        line.Append(SpecialFunctions.CreateTabString(score, ""));
        Debug.Write(SpecialFunctions.CreateTabString(score, ""));
        logLikelihoods[i] = score.Loglikelihood;
        AltModelDistribution.InitialParamVals = score.OptimizationParameters;
    }

    double diff = logLikelihoods[1] - logLikelihoods[0];
    // The test is fed a non-negative difference.
    double pValue = SpecialFunctions.LogLikelihoodRatioTest(Math.Max(diff, 0), ChiSquareDegreesOfFreedom);
    line.Append(SpecialFunctions.CreateTabString(diff, pValue));
    Debug.WriteLine(SpecialFunctions.CreateTabString(diff, pValue));

    return line.ToString();
}
//protected override NullDataCollection CreateNullDataGenerator(ModelScorer modelScorer, PhyloTree phyloTree, RangeCollection nullIndexRangeCollection, Dictionary<string, Dictionary<string, BooleanStatistics>> predictorVariableToCaseIdToRealNonMissingValue)
//{
//    if (DateTime.Now.Date == new DateTime(2006, 6, 28).Date) // for testing, force it to use the parametric bootstrap
//    {
//        return NullDataCollection.GetInstance(
//            new NullDataGeneratorAlongTree(modelScorer, phyloTree, (ModelTesterDiscrete)this),
//            nullIndexRangeCollection,
//            predictorVariableToCaseIdToRealNonMissingValue);
//    }
//    return base.CreateNullDataGenerator(modelScorer, phyloTree, nullIndexRangeCollection, predictorVariableToCaseIdToRealNonMissingValue);
//}

//public override Converter<Leaf, SufficientStatistics> CreateTargetSufficientStatisticsMap(Dictionary<string, ISufficientStatistics> caseIdToNonMissingValue)
//{
//    return ISufficientStatistics.DictionaryToLeafMap(caseIdToNonMissingValue);
//}

//public override Converter<Leaf, SufficientStatistics> CreatePredictorSufficientStatisticsMap(Dictionary<string, BooleanStatistics> caseIdToNonMissingValue)
//{
//    return CreateTargetSufficientStatisticsMap(caseIdToNonMissingValue);
//}

/// <summary>
/// Builds the report line for one work item: identifying columns, predictor and
/// target leaf counts, the four Fisher counts and the global non-missing count,
/// followed either by the NaN filler (when a relevant class has no leaves) or by
/// the log-likelihood-ratio difference and its test value.
/// </summary>
/// <param name="modelScorer">Forwarded to <c>ComputeLLR</c>.</param>
/// <param name="phyloTree">Supplies the leaf counts; also forwarded to <c>ComputeLLR</c>.</param>
/// <param name="rowAndTargetData">Supplies the row's column values plus its predictor and target data.</param>
/// <param name="workList">Not read by this implementation.</param>
/// <param name="rowIndex">Written into the report line as-is.</param>
/// <param name="workListCount">Written into the report line as-is.</param>
/// <param name="workIndex">Written into the report line as-is.</param>
/// <returns>The completed report line.</returns>
protected override string CreateReportLine(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    RowData rowAndTargetData,
    UniversalWorkList workList,
    int rowIndex,
    int workListCount,
    int workIndex)
{
    //!!!there is very similar code in ModelTesterGaussian.cs

    // We're iterating over each predictor (e.g. hla), each target (e.g. position in the
    // sequence, and each possible substring at that position). Then we ask the question:
    // does the presence of predictor (e.g. hla) influence the probability that target
    // (e.g. mer in position n1pos) will show up?
    // nullIndex specifies whether this is the true data or randomized data.
    Dictionary<string, string> fields = rowAndTargetData.Row;
    string predictorName = fields[PhyloTree.PredictorVariableColumnName]; // e.g. hla
    string targetName = fields[PhyloTree.TargetVariableColumnName];       // e.g. A@182 (amino acid "A" at position 182)
    int nullIndex = int.Parse(fields[PhyloTree.NullIndexColumnName]);

    Dictionary<string, SufficientStatistics> predictorByCaseId = rowAndTargetData.PredictorData;
    Dictionary<string, SufficientStatistics> targetByCaseId = rowAndTargetData.TargetData;

    // The first predictor value decides whether the predictor is treated as boolean.
    SufficientStatistics representative;
    using (IEnumerator<SufficientStatistics> valueEnumerator = predictorByCaseId.Values.GetEnumerator())
    {
        valueEnumerator.MoveNext();
        representative = valueEnumerator.Current;
    }
    bool predictorIsBoolean = representative is BooleanStatistics;

    Converter<Leaf, SufficientStatistics> targetMap = CreateSufficientStatisticsMap(targetByCaseId);
    Converter<Leaf, SufficientStatistics> predictorMap = CreateSufficientStatisticsMap(predictorByCaseId);

    // Non-boolean predictors get zero-filled placeholders for the count columns.
    int[] predictorCounts;
    if (predictorIsBoolean)
    {
        predictorCounts = phyloTree.CountsOfLeaves(predictorMap, NullModelDistribution);
    }
    else
    {
        predictorCounts = new int[2];
    }
    int[] targetCounts = phyloTree.CountsOfLeaves(targetMap, NullModelDistribution);

    int predictorFalse = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
    int predictorTrue = predictorCounts[(int)DistributionDiscreteBinary.DistributionClass.True];
    int targetFalse = targetCounts[(int)DistributionDiscreteBinary.DistributionClass.False];
    int targetTrue = targetCounts[(int)DistributionDiscreteBinary.DistributionClass.True];

    int[] fisherCounts;
    int globalNonMissingCount;
    if (predictorIsBoolean)
    {
        fisherCounts = phyloTree.FisherCounts(predictorMap, targetMap);
        globalNonMissingCount = fisherCounts[0] + fisherCounts[1] + fisherCounts[2] + fisherCounts[3];
    }
    else
    {
        fisherCounts = new int[4];
        globalNonMissingCount = phyloTree.GlobalNonMissingCount(predictorMap, targetMap);
    }

    StringBuilder line = new StringBuilder(
        SpecialFunctions.CreateTabString(
            this,
            rowIndex,
            workListCount,
            workIndex,
            nullIndex,
            predictorName,
            predictorFalse,
            predictorTrue,
            predictorTrue + predictorFalse,
            targetName,
            targetFalse,
            targetTrue,
            targetTrue + targetFalse,
            fisherCounts[0],
            fisherCounts[1],
            fisherCounts[2],
            fisherCounts[3],
            globalNonMissingCount,
            ""));

    // The row is not scored when a target class is empty, or (boolean predictors
    // only) when a predictor class is empty.
    bool predictorHasEmptyClass = predictorIsBoolean && Array.IndexOf(predictorCounts, 0) >= 0;
    bool targetHasEmptyClass = Array.IndexOf(targetCounts, 0) >= 0;
    if (predictorHasEmptyClass || targetHasEmptyClass)
    {
        CompleteRowWithNaN(line);
        return line.ToString();
    }

    double targetMarginal = (double)targetTrue / (double)(targetTrue + targetFalse);
    double predictorMarginal = (double)predictorTrue / (double)(predictorTrue + predictorFalse);

    double diff = ComputeLLR(
        modelScorer,
        phyloTree,
        line,
        targetMarginal,
        predictorMarginal,
        predictorMap,
        targetMap);

    // The test is fed a non-negative difference.
    double pValue = SpecialFunctions.LogLikelihoodRatioTest(Math.Max(diff, 0), ChiSquareDegreesOfFreedom);
    line.Append(SpecialFunctions.CreateTabString(diff, pValue));

    return line.ToString();
}