/// <summary>
/// Computes the connected components of this graph, treating every edge as undirected,
/// for the nodes that are an endpoint of at least one edge.
/// </summary>
/// <remarks>
/// Nodes that no edge refers to are omitted from the result. A node whose only edges are
/// self-loops is still included, mapped to a component that contains just that node.
/// Every node of one component maps to the same <c>Set</c> instance, so components can be
/// compared by reference.
/// </remarks>
/// <returns>
/// A dictionary whose key is a node's Name and whose value is the set (component) that
/// the node belongs to.
/// </returns>
/// <exception cref="KeyNotFoundException">An edge refers to a name that is not the Name of any node in Nodes.</exception>
/// <exception cref="ArgumentException">Two nodes that are edge endpoints share the same Name.</exception>
public Dictionary<string, Set<GraphNode>> GetConnectedComponentsInDictionary()
{
    // Collect the names of all edge endpoints once, in a hash set, so the membership
    // test below is O(1) per node instead of a linear scan over a list with duplicates.
    HashSet<string> endpointLabels = new HashSet<string>();
    foreach (DirectedGraphEdge edge in Edges)
    {
        endpointLabels.Add(edge.From);
        endpointLabels.Add(edge.To);
    }

    IEnumerable<GraphNode> connectedNodes = from node in Nodes
                                            where endpointLabels.Contains(node.Name)
                                            select node;

    Dictionary<string, Set<GraphNode>> connectedComponents = new Dictionary<string, Set<GraphNode>>();

    // at the beginning each node is in its own component
    foreach (GraphNode node in connectedNodes)
    {
        connectedComponents.Add(node.Name, new Set<GraphNode>(node));
    }

    // for each edge we join the components if the corresponding nodes are not already in the same component
    foreach (DirectedGraphEdge edge in Edges)
    {
        Set<GraphNode> fromComponent = connectedComponents[edge.From];
        Set<GraphNode> toComponent = connectedComponents[edge.To];
        if (Object.ReferenceEquals(fromComponent, toComponent))
        {
            continue;
        }

        // add smaller set to bigger set (on a tie the To-side component absorbs the From-side one)
        Set<GraphNode> absorbing;
        Set<GraphNode> absorbed;
        if (fromComponent.Count > toComponent.Count)
        {
            absorbing = fromComponent;
            absorbed = toComponent;
        }
        else
        {
            absorbing = toComponent;
            absorbed = fromComponent;
        }

        absorbing.AddNewRange(absorbed);

        // update references in added set
        foreach (GraphNode movedNode in absorbed)
        {
            connectedComponents[movedNode.Name] = absorbing;
        }
    }

    return connectedComponents;
}
//static private Dictionary<Pair<NEC, Hla>, bool> CloseHuman = null;
//GeneratorType.Hla | GeneratorType.Position | GeneratorType.Property | GeneratorType.AndHla | GeneratorType.Zero6Supertype | GeneratorType.AndZero6Supertype
/// <summary>
/// Builds the set of features generated for one (NEC, Hla) pair: first the HLA and
/// supertype features, then, depending on the flags, the "Ei" features, the N-/C-flank
/// features, the epitope pair features, and the features that pair the residues on either
/// side of the flank/epitope boundaries.
/// </summary>
/// <param name="entity">The entity to generate features for; it is cast to <c>Pair&lt;NEC, Hla&gt;</c>.</param>
/// <param name="supertypeTableSource">Passed through to <c>CreateAndAddFeatureSupertype</c>.</param>
/// <param name="flankSizeOrNull">
/// When non-null, the NEC is replaced by <c>NEC.GetInstance(original, flankSize)</c> before features
/// are generated; the boundary-pair features are only produced when this is non-null and greater than 0.
/// </param>
/// <param name="includeFlankNECFeatures">Enables the flank, epitope-pair and boundary-pair feature groups.</param>
/// <param name="includeChemicalProperties">Enables the property-based variants within each group.</param>
/// <param name="includeAAFeatures">Enables the amino-acid-based variants within each group.</param>
/// <param name="addEiFeatures">When true, <c>AddEiFeatures</c> is called on the feature set.</param>
/// <returns>The generated feature set (seeded from the HLA/supertype feature set).</returns>
private static Set <IHashableFeature> GenerateFeatureSet(
    object entity,
    string supertypeTableSource,
    int?flankSizeOrNull,
    bool includeFlankNECFeatures,
    bool includeChemicalProperties,
    bool includeAAFeatures,
    bool addEiFeatures
    )
{
    // Feature-generation switches that are currently hard-coded off.
    bool includeAndHlaAndSTWithEpitopeAdjFeatures = false;
    bool subtractSupertypeFeatures = false;
    bool subtractHlaFeatures = false;
    bool substractChemAACrissCrossFeatures = false;

    // Trivially true while includeAndHlaAndSTWithEpitopeAdjFeatures is hard-coded to false.
    SpecialFunctions.CheckCondition(!includeAndHlaAndSTWithEpitopeAdjFeatures || includeFlankNECFeatures);

    Pair <NEC, Hla> necAndHlaX = (Pair <NEC, Hla>)entity;
    // Use the NEC as given, or a version derived from it for the requested flank size.
    NEC nec = (null == flankSizeOrNull) ? necAndHlaX.First : NEC.GetInstance(necAndHlaX.First, (int)flankSizeOrNull);
    Hla hla = necAndHlaX.Second;
    Debug.Assert(nec.N.Length == nec.C.Length); // real assert
    Pair <NEC, Hla> necAndHla = new Pair <NEC, Hla>(nec, hla);

    // HLA-related features are collected separately because later helpers receive this set as an argument.
    Set <IHashableFeature> hlaishFeatureSet = new Set <IHashableFeature>();
    CreateAndAddHlaFeature(subtractHlaFeatures, hla, necAndHla, ref hlaishFeatureSet);
    CreateAndAddFeatureSupertype(supertypeTableSource, subtractSupertypeFeatures, hla, necAndHla, ref hlaishFeatureSet, Assembly.GetExecutingAssembly(), Predictor.ResourceString);

    // The result set is created from the HLA-ish features.
    Set <IHashableFeature> featureSet = Set <IHashableFeature> .GetInstance(hlaishFeatureSet);
    if (addEiFeatures)
    {
        AddEiFeatures(includeChemicalProperties, includeAAFeatures, substractChemAACrissCrossFeatures, nec, necAndHla, hlaishFeatureSet, featureSet);
    }

    // ---- Flank features (N flank and C flank) ----
    if (includeFlankNECFeatures)
    {
        // Materialized and checked even when includeAAFeatures is false.
        List <IHashableFeature> aaInNFlankFeatureList = new List <IHashableFeature>(In.GetAASeqInRegionInstance(1, necAndHla, NFlank.GetInstance()));
        DebugCheckThatEvaluatesToTrue(necAndHla, aaInNFlankFeatureList);
        if (includeAAFeatures)
        {
            featureSet.AddNewRange(aaInNFlankFeatureList); //AA in N flank
            featureSet.AddNewRange(In.GetAASeqInRegionInstance(2, necAndHla, NFlank.GetInstance())); //AA1-AA2 in Nflank
            featureSet.AddNewRange(SubSeq.GetInSubSeqEnumeration(NFlank.GetInstance(), false, 1, necAndHla)); //AA@x in N flank (numbering is 5 4 3 2 1)
            featureSet.AddNewRange(SubSeq.GetInSubSeqEnumeration(NFlank.GetInstance(), false, 2, necAndHla)); //AA1-AA2@x in Nflank (x is position of AA2, i.e., the smaller number)
            featureSet.AddNewRange(In.GetAASeqInRegionInstance(1, necAndHla, CFlank.GetInstance())); //AA in Cflank
            featureSet.AddNewRange(In.GetAASeqInRegionInstance(2, necAndHla, CFlank.GetInstance())); //AA1-AA2 in Cflank
            featureSet.AddNewRange(SubSeq.GetInSubSeqEnumeration(CFlank.GetInstance(), true, 1, necAndHla)); //AA@x in C flank (numbering is 1 2 3 4 5)
            featureSet.AddNewRange(SubSeq.GetInSubSeqEnumeration(CFlank.GetInstance(), true, 2, necAndHla)); //AA1-AA2@x in Cflank (x is position of AA1, i.e., the smaller number)
        }
        if (includeChemicalProperties)
        {
            // Property features use AddNewOrOldRange rather than AddNewRange.
            // NOTE(review): presumably because different residues can yield the same property feature — confirm.
            featureSet.AddNewOrOldRange(InProperty.GetPropertySeqInRegionInstance(1, necAndHla, NFlank.GetInstance()));
            featureSet.AddNewOrOldRange(SubSeq.GetInPropertySubSeqEnumeration(NFlank.GetInstance(), false, 1, necAndHla));
            featureSet.AddNewOrOldRange(InProperty.GetPropertySeqInRegionInstance(1, necAndHla, CFlank.GetInstance()));
            featureSet.AddNewOrOldRange(SubSeq.GetInPropertySubSeqEnumeration(CFlank.GetInstance(), true, 1, necAndHla));
            featureSet.AddNewOrOldRange(InProperty.GetPropertySeqInRegionInstance(2, necAndHla, NFlank.GetInstance()));
            featureSet.AddNewOrOldRange(SubSeq.GetInPropertySubSeqEnumeration(NFlank.GetInstance(), false, 2, necAndHla));
            featureSet.AddNewOrOldRange(InProperty.GetPropertySeqInRegionInstance(2, necAndHla, CFlank.GetInstance()));
            featureSet.AddNewOrOldRange(SubSeq.GetInPropertySubSeqEnumeration(CFlank.GetInstance(), true, 2, necAndHla));
        }
    }

    // ---- Epitope pair features and flank/epitope boundary features ----
    if (includeFlankNECFeatures)
    {
        if (includeAAFeatures)
        {
            //EV in Epitope
            AddFeatureWithOptionalAndHlaAndST(In.GetAASeqInRegionInstance(2, necAndHla, Epitope.GetInstance()), includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, false, ref featureSet);//AA1-AA2 in Epitope
            //RR in Epitope[@1-2]
            AddFeatureWithOptionalAndHlaAndST(SubSeq.GetInSubSeqEnumeration(Epitope.GetInstance(), true, 2, necAndHla), includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, false, ref featureSet);//AA1-AA2@x in Epitope (x is position of AA1, i.e., the smaller number)
        }
        if (includeChemicalProperties)
        {
            //polar,cyclic in Epitope
            AddFeatureWithOptionalAndHlaAndST(InProperty.GetPropertySeqInRegionInstance(2, necAndHla, Epitope.GetInstance()), includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, false, ref featureSet);
            //polar,large in Epitope[@8-9]
            AddFeatureWithOptionalAndHlaAndST(SubSeq.GetInPropertySubSeqEnumeration(Epitope.GetInstance(), true, 2, necAndHla), includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, false, ref featureSet);
        }

        //AA1-AA2 in Nflank,Epitope, etc
        // Only possible when a positive flank size was requested.
        if (null != flankSizeOrNull && (int)flankSizeOrNull > 0)
        {
            // NOTE(review): the Evaluate calls below are made on the original 'entity', whereas the
            // Debug.Asserts further down evaluate on the flank-adjusted 'necAndHla' — confirm both
            // yield the same boundary residues.
            string epitope = (string)Epitope.GetInstance().Evaluate(entity);

            // Residue of the N flank at position 1 (counted with the 'false' direction flag).
            SubSeq lastNAAFeature = SubSeq.GetInstance(1, 1, false, NFlank.GetInstance());
            string lastNAA = (string)lastNAAFeature.Evaluate(entity);
            In inLastNAA = In.GetInstance(lastNAA, lastNAAFeature);

            // First residue of the epitope.
            SubSeq firstEAAFeature = SubSeq.GetInstance(1, 1, true, Epitope.GetInstance());
            string firstEAA = (string)firstEAAFeature.Evaluate(entity);
            Debug.Assert(firstEAA == epitope.Substring(0, 1));// real assert
            In inFirstEAA = In.GetInstance(firstEAA, firstEAAFeature);

            // Last residue of the epitope (position epitope.Length).
            SubSeq lastEAAFeature = SubSeq.GetInstance(epitope.Length, epitope.Length, true, Epitope.GetInstance());
            string lastEAA = (string)lastEAAFeature.Evaluate(entity);
            In inLastEAA = In.GetInstance(lastEAA, lastEAAFeature);

            // First residue of the C flank.
            SubSeq firstCAAFeature = SubSeq.GetInstance(1, 1, true, CFlank.GetInstance());
            string firstCAA = (string)firstCAAFeature.Evaluate(entity);
            In inFirstCAA = In.GetInstance(firstCAA, firstCAAFeature);

            if (includeAAFeatures)
            {
                // Conjunctions of the two residues that straddle each boundary.
                And andLastNNAAFirstEAA = And.GetInstance(inLastNAA, inFirstEAA);
                AddFeatureWithOptionalAndHlaAndST(andLastNNAAFirstEAA, includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, /*checkThatNew*/ true, ref featureSet);
                And andLastEAAFirstCAA = And.GetInstance(inLastEAA, inFirstCAA);
                AddFeatureWithOptionalAndHlaAndST(andLastEAAFirstCAA, includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, /*checkThatNew*/ true, ref featureSet);
            }
            if (includeChemicalProperties)
            {
                // Cross product of the properties of the two residues straddling the N-flank/epitope boundary.
                foreach (string lastNProperty in KmerProperties.AaToPropList[Biology.GetInstance().OneLetterAminoAcidAbbrevTo3Letter[lastNAA[0]]])
                {
                    InProperty inLastNProperty = InProperty.GetInstance(lastNProperty, lastNAAFeature);
                    foreach (string firstEProperty in KmerProperties.AaToPropList[Biology.GetInstance().OneLetterAminoAcidAbbrevTo3Letter[firstEAA[0]]])
                    {
                        InProperty inFirstEProperty = InProperty.GetInstance(firstEProperty, firstEAAFeature); //!!!get this out of the loop?
                        And andLastNPropertyFirstEProperty = And.GetInstance(inLastNProperty, inFirstEProperty);
                        AddFeatureWithOptionalAndHlaAndST(andLastNPropertyFirstEProperty, includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, /*checkThatNew*/ false, ref featureSet);
                        Debug.Assert((bool)andLastNPropertyFirstEProperty.Evaluate(necAndHla));
                    }
                }
                // Same cross product for the epitope/C-flank boundary.
                foreach (string lastEProperty in KmerProperties.AaToPropList[Biology.GetInstance().OneLetterAminoAcidAbbrevTo3Letter[lastEAA[0]]])
                {
                    InProperty inlastEProperty = InProperty.GetInstance(lastEProperty, lastEAAFeature);
                    foreach (string firstCProperty in KmerProperties.AaToPropList[Biology.GetInstance().OneLetterAminoAcidAbbrevTo3Letter[firstCAA[0]]])
                    {
                        InProperty infirstCProperty = InProperty.GetInstance(firstCProperty, firstCAAFeature); //!!!get this out of the loop?
                        And andlastEPropertyfirstCProperty = And.GetInstance(inlastEProperty, infirstCProperty);
                        AddFeatureWithOptionalAndHlaAndST(andlastEPropertyfirstCProperty, includeAndHlaAndSTWithEpitopeAdjFeatures, hlaishFeatureSet, /*checkThatNew*/ false, ref featureSet);
                        Debug.Assert((bool)andlastEPropertyfirstCProperty.Evaluate(necAndHla));
                    }
                }
            }
        }
    }
    return(featureSet);
}
/// <summary>
/// Evaluates the model by cross-validation: the leaves for which neither mapping is missing
/// are shuffled and split into <c>_crossValidateCount</c> folds; for each fold the inner
/// evaluator is trained on the remaining leaves and then scored on all data with the trained
/// parameters, and the per-fold test-given-train results are accumulated.
/// </summary>
/// <param name="v1">Maps a leaf to the sufficient statistics of the first variable.</param>
/// <param name="v2">Maps a leaf to the sufficient statistics of the second variable.</param>
/// <returns>
/// The accumulated cross-validation results, or null when the fold loop never runs.
/// Accumulation stops early once the combined alternative log-likelihood is infinite.
/// </returns>
public override EvaluationResults EvaluateModelOnData(Converter<Leaf, SufficientStatistics> v1, Converter<Leaf, SufficientStatistics> v2)
{
    // Gather the usable leaves and, along the way, fold their content into a seed so that the
    // shuffle depends only on the data.
    // NOTE(review): string.GetHashCode is not guaranteed stable across runtimes/processes — confirm
    // whether cross-run reproducibility is relied upon.
    List<Leaf> usableLeaves = new List<Leaf>(100);
    int shuffleSeed = 0;
    foreach (Leaf leaf in ModelScorer.PhyloTree.LeafCollection)
    {
        SufficientStatistics stats1 = v1(leaf);
        SufficientStatistics stats2 = v2(leaf);
        if (stats1.IsMissing() || stats2.IsMissing())
        {
            continue;
        }

        usableLeaves.Add(leaf);
        string fingerprint = leaf.CaseName + stats1.ToString() + stats2.ToString();
        shuffleSeed ^= fingerprint.GetHashCode();
    }

    Random shuffler = new Random(shuffleSeed);
    usableLeaves = SpecialFunctions.Shuffle(usableLeaves, ref shuffler);

    // Integer division: any remainder leaves at the end of the list are never held out.
    int foldSize = usableLeaves.Count / _crossValidateCount;

    EvaluationResultsCrossValidate accumulated = null;
    double testAltLLSum = 0;  // for debugging
    double testNullLLSum = 0; // for debugging

    for (int fold = 0; fold < _crossValidateCount; fold++)
    {
        // Held-out leaves are [heldOutStart, afterHeldOut); everything else is training data.
        int heldOutStart = fold * foldSize;
        int afterHeldOut = heldOutStart + foldSize;
        int trailingCount = usableLeaves.Count - afterHeldOut;

        Set<Leaf> trainingLeaves = new Set<Leaf>(SpecialFunctions.SubList(usableLeaves, afterHeldOut, trailingCount));
        trainingLeaves.AddNewRange(SpecialFunctions.SubList(usableLeaves, 0, heldOutStart));

        Converter<Leaf, SufficientStatistics> v1Train = CreateFilteredMap(v1, trainingLeaves);
        Converter<Leaf, SufficientStatistics> v2Train = CreateFilteredMap(v2, trainingLeaves);

        // Fit on the training leaves, then score the full data with the fitted parameters.
        EvaluationResults trainingResults = InternalEvaluator.EvaluateModelOnData(v1Train, v2Train);
        EvaluationResults testAndTrainResult = InternalEvaluator.EvaluateModelOnDataGivenParams(v1, v2, trainingResults);
        EvaluationResultsTestGivenTrain testGivenTrainResult = EvaluationResultsTestGivenTrain.GetInstance(this, trainingResults, testAndTrainResult);

        if (accumulated != null)
        {
            accumulated = accumulated.AddNewResults(testGivenTrainResult);
        }
        else
        {
            accumulated = EvaluationResultsCrossValidate.GetInstance(this, testGivenTrainResult);
        }

        // no point in continuing...infinity will kill everything.
        if (double.IsInfinity(accumulated.AltLL))
        {
            break;
        }

#if DEBUG
        double eps = 1E-10;

        // Re-scoring the training data with the fitted parameters must reproduce the training likelihoods.
        EvaluationResults testTrainingResults = InternalEvaluator.EvaluateModelOnDataGivenParams(v1Train, v2Train, trainingResults);
        Debug.Assert(ComplexNumber.ApproxEqual(testTrainingResults.AltLL, trainingResults.AltLL, eps) && ComplexNumber.ApproxEqual(testTrainingResults.NullLL, trainingResults.NullLL, eps));
        //Debug.Assert(testTrainingResults.Equals(trainingResults));

        // The test-given-train likelihoods must equal (all data) minus (training data).
        double newNullLL = testAndTrainResult.NullLL - trainingResults.NullLL;
        double newAltLL = testAndTrainResult.AltLL - trainingResults.AltLL;
        Debug.Assert(ComplexNumber.ApproxEqual(newNullLL, testGivenTrainResult.NullLL, eps));
        Debug.Assert(ComplexNumber.ApproxEqual(newAltLL, testGivenTrainResult.AltLL, eps));

        testNullLLSum += newNullLL;
        testAltLLSum += newAltLL;
        Debug.Assert(ComplexNumber.ApproxEqual(testNullLLSum, accumulated.NullLL, eps), "Combined result has wrong NullLL");
        Debug.Assert(ComplexNumber.ApproxEqual(testAltLLSum, accumulated.AltLL, eps), "Combined result has wrong AltLL");
#endif
    }

    return accumulated;
}
//Similar to the other tabulators, but can work with multiple sets of pValues files
//!!!would be better if could cut off really bad pValues to save memory
//!!! also would be nice to have filters
/// <summary>
/// Reads the result files matched by each input pattern, computes q-values for the real rows
/// against the collected null values, and writes a tab-delimited report to
/// <paramref name="outputFileName"/>.
/// </summary>
/// <param name="inputFilePatternCollection">
/// One or more "broad" input file patterns. Each may consist of several "narrow" patterns joined
/// with '+'; the narrow parts of one broad pattern must cover disjoint null indexes, and every
/// broad pattern must cover the same set of null indexes as the first one. Must be non-null and
/// non-empty.
/// </param>
/// <param name="outputFileName">File to create; an existing file of that name is overwritten.</param>
/// <param name="keepTest">Row filter passed through to <c>CreateTabulateReportInternal</c>.</param>
/// <param name="maxPValue">P-value cutoff passed through to <c>CreateTabulateReportInternal</c>.</param>
/// <param name="auditRowIndexValues">
/// When true, each broad pattern is additionally required to contain the index -1 (the real data).
/// </param>
public static void CreateTabulateReport(ICollection<string> inputFilePatternCollection, string outputFileName, KeepTest<Dictionary<string, string>> keepTest, double maxPValue, bool auditRowIndexValues)
{
    // Validate before touching the file system: with no patterns there is no index set to count
    // below (previously a NullReferenceException), and the output file would already have been
    // created or truncated by then.
    SpecialFunctions.CheckCondition(inputFilePatternCollection != null && inputFilePatternCollection.Count > 0, "At least one input file pattern is required to create a tabulate report.");

    //SpecialFunctions.CheckCondition(!File.Exists(outputFileName), "Output file already exists: " + outputFileName);
    using (TextWriter textWriter = File.CreateText(outputFileName)) // Do this early so that if it fails, we'll know
    {
        List<Dictionary<string, string>> realRowCollectionToSort = new List<Dictionary<string, string>>();
        List<double> nullValueCollectionToBeSorted = new List<double>();
        string headerSoFar = null;

        // Index set covered by the first broad pattern; every later broad pattern must match it.
        Set<int> broadRealAndNullIndexSetSoFar = null;
        foreach (string broadInputFilePattern in inputFilePatternCollection)
        {
            Set<int> narrowRealAndNullIndexSetSetSoFar = Set<int>.GetInstance();
            foreach (string narrowInputFilePattern in broadInputFilePattern.Split('+'))
            {
                Set<int> realAndNullIndexSet = CreateTabulateReportInternal(narrowInputFilePattern, keepTest, maxPValue, auditRowIndexValues,
                    ref realRowCollectionToSort, ref nullValueCollectionToBeSorted, ref headerSoFar);

                //Instead of throwing an error, we could filter out the duplicated null indexes
                SpecialFunctions.CheckCondition(narrowRealAndNullIndexSetSetSoFar.IntersectionIsEmpty(realAndNullIndexSet),
                    string.Format("Within inputFilePattern {0}, multiple '+'-connected parts cover the same nullIndex(s), {1}",
                        broadInputFilePattern,
                        narrowRealAndNullIndexSetSetSoFar.Intersection(realAndNullIndexSet)));

                narrowRealAndNullIndexSetSetSoFar.AddNewRange(realAndNullIndexSet);
            }

            SpecialFunctions.CheckCondition(!auditRowIndexValues || narrowRealAndNullIndexSetSetSoFar.Contains(-1),
                string.Format("The 'null' index -1 for the real data was not seen in {0}", broadInputFilePattern));

            if (broadRealAndNullIndexSetSoFar == null)
            {
                broadRealAndNullIndexSetSoFar = narrowRealAndNullIndexSetSetSoFar;
            }
            else
            {
                SpecialFunctions.CheckCondition(broadRealAndNullIndexSetSoFar.Equals(narrowRealAndNullIndexSetSetSoFar),
                    string.Format("The broad inputFilePattern {0} covers a different set of nullIndexes ({1}) than its predecessors ({2})",
                        broadInputFilePattern, narrowRealAndNullIndexSetSetSoFar, broadRealAndNullIndexSetSoFar));
            }
        }

        // One of the covered indexes is taken to be the real run; the rest count as randomized runs.
        double numberOfRandomizationRuns = broadRealAndNullIndexSetSoFar.Count - 1;
        Console.WriteLine("Detected {0} randomized runs relative to the number of real runs.", numberOfRandomizationRuns);

        Dictionary<Dictionary<string, string>, double> qValueList = SpecialFunctions.ComputeQValues(ref realRowCollectionToSort, AccessPValueFromPhylotreeRow, ref nullValueCollectionToBeSorted, numberOfRandomizationRuns);

        //!!!this code is repeated elsewhere
        textWriter.WriteLine(SpecialFunctions.CreateTabString(headerSoFar, "qValue"));
        foreach (Dictionary<string, string> row in realRowCollectionToSort)
        {
            double qValue = qValueList[row];
            // NOTE(review): the "" key presumably holds the row's original tab-delimited text — confirm.
            textWriter.WriteLine(SpecialFunctions.CreateTabString(row[""], qValue));
        }
    }
}