/// <summary>
/// Loads the logistic-regression epitope predictor for the given model name: resolves the
/// featurizer, sample NEC shape, HLA factory, and source data file, then deserializes the
/// trained Logistic model from an embedded resource and builds the per-dataset learning
/// data, keyed by peptide, then normalized HLA, then original HLA.
/// </summary>
/// <param name="modelName">Model identifier, e.g. "LANLIEDB03062007" (matched case-insensitively).</param>
/// <param name="eLength">Epitope (E region) length the model was trained for.</param>
/// <param name="ncLength">N/C flank length (not used by the one known model).</param>
/// <param name="hlaForNormalization">Maps an observed HLA to its normalized form.</param>
private void LoadUpPredictor(string modelName, int eLength, int ncLength, Converter<Hla, Hla> hlaForNormalization)
{
    //Load up the predictor
    string featurerizerName;
    // Invariant culture so model-name matching does not depend on the current locale
    // (ToLower() is culture-sensitive, e.g. the Turkish 'I' problem).
    switch (modelName.ToLowerInvariant())
    {
        //!!!would be better not to have multiple of these switch statements around - looks like a job for a Class
        case "lanliedb03062007":
            featurerizerName = "[email protected]";
            SampleNEC = NEC.GetInstance("", new string(' ', eLength), "");
            HlaFactory = HlaFactory.GetFactory("MixedWithB15AndA68");
            SourceDataFileName = "lanlIedb03062007.pos.source.txt";
            NameList = new string[] { "LANL", "IEDB" };
            break;
        default:
            SpecialFunctions.CheckCondition(false, "Don't know what featurerizer to use for the model");
            // Unreachable (CheckCondition throws above), but needed for definite assignment.
            featurerizerName = null;
            SourceDataFileName = null;
            NameList = null;
            break;
    }

    Converter<object, Set<IHashableFeature>> featurizer = FeatureLib.CreateFeaturizer(featurerizerName);

    //GeneratorType generatorType = GeneratorType.ComboAndZero6SuperType;
    //FeatureSerializer featureSerializer = PositiveNegativeExperimentCollection.GetFeatureSerializer();
    //KmerDefinition = kmerDefinition;
    //HlaResolution hlaResolution = HlaResolution.ABMixed;

    string resourceName = string.Format("maxentModel{0}{1}{2}{3}.xml", modelName.Split('.')[0], SampleNEC.N.Length, SampleNEC.E.Length, SampleNEC.C.Length);
    EpitopeLearningDataList = new List<EpitopeLearningDataDupHlaOK>();
    using (StreamReader streamReader = Predictor.OpenResource(resourceName))
    {
        Logistic = (Logistic)FeatureLib.FeatureSerializer.FromXmlStreamReader(streamReader);
        //Logistic.FeatureGenerator = EpitopeFeatureGenerator.GetInstance(KmerDefinition, generatorType, featureSerializer).GenerateFeatureSet;
        // Reuse the featurizer created above instead of constructing a second identical one
        // (the original built it twice and left the first local unused).
        Logistic.FeatureGenerator = featurizer;
        foreach (string name in NameList)
        {
            EpitopeLearningData epitopeLearningDataX = EpitopeLearningData.GetDbWhole(HlaFactory, SampleNEC.E.Length, name, SourceDataFileName);
            Debug.Assert(epitopeLearningDataX.Count > 0, "Expect given data to have some data");

            //!!!combine with previous step
            // Re-key the flat (peptide, hla) data as peptide -> normalizedHla -> originalHla -> label.
            EpitopeLearningDataDupHlaOK epitopeLearningData = new EpitopeLearningDataDupHlaOK(epitopeLearningDataX.Name);
            foreach (KeyValuePair<Pair<string, Hla>, bool> merAndHlaAndLabel in epitopeLearningDataX)
            {
                Hla hlaIn = merAndHlaAndLabel.Key.Second;
                Hla hlaOut = hlaForNormalization(hlaIn);
                Dictionary<Hla, Dictionary<Hla, bool>> hla2ToHlaToLabel = SpecialFunctions.GetValueOrDefault(epitopeLearningData, merAndHlaAndLabel.Key.First);
                Dictionary<Hla, bool> hlaToLabel = SpecialFunctions.GetValueOrDefault(hla2ToHlaToLabel, hlaOut);
                hlaToLabel.Add(hlaIn, merAndHlaAndLabel.Value);
            }
            EpitopeLearningDataList.Add(epitopeLearningData);
        }
    }
    HlaForNormalization = hlaForNormalization;
}
//!!!very similar to other code
/// <summary>
/// Reads a tab-delimited epitope table (columns N, epitope, C, hla, label) into a
/// dictionary keyed by (NEC, Hla), with label "1" mapped to true and "0" to false.
/// </summary>
/// <param name="hlaFactory">Factory used to resolve each row's "hla" string.</param>
/// <param name="fileName">Tab-delimited file with header "N\tepitope\tC\thla\tlabel".</param>
/// <param name="dedup">
/// If true, a repeated (NEC, Hla) key is tolerated when its label agrees with the one
/// already recorded (a contradictory label raises); if false, any repeated key raises
/// via Dictionary.Add.
/// </param>
/// <returns>Map from (NEC, Hla) pair to its boolean label.</returns>
public static Dictionary<Pair<NEC, Hla>, bool> ReadTable(HlaFactory hlaFactory, string fileName, bool dedup)
{
    Dictionary<Pair<NEC, Hla>, bool> table = new Dictionary<Pair<NEC, Hla>, bool>();
    foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(fileName, "N\tepitope\tC\thla\tlabel", false))
    {
        string n = row["N"];
        string epitope = row["epitope"];
        SpecialFunctions.CheckCondition(Biology.GetInstance().LegalPeptide(epitope), string.Format("Peptide, '{0}', contains illegal char.", epitope));
        string c = row["C"];
        NEC nec = NEC.GetInstance(n, epitope, c);
        Hla hla = hlaFactory.GetGroundInstance(row["hla"]);
        string labelString = row["label"];
        SpecialFunctions.CheckCondition(labelString == "0" || labelString == "1", "Expect label to be '0' or '1'");
        Pair<NEC, Hla> pair = new Pair<NEC, Hla>(nec, hla);
        bool labelAsBool = (labelString == "1");
        // Single dictionary lookup via TryGetValue (was ContainsKey followed by the
        // indexer, i.e. two lookups per duplicate row).
        bool existingLabel;
        if (dedup && table.TryGetValue(pair, out existingLabel))
        {
            SpecialFunctions.CheckCondition(existingLabel == labelAsBool, "The example " + pair.ToString() + " appears with contradictory labels.");
            continue;
        }
        table.Add(pair, labelAsBool);
    }
    return (table);
}
/* From [Microsoft Research]:
 *
 * - I've changed two things wrt prior corrections. First, I'm computing relative frequencies
 * across length per HLA rather than per supertype (there was too much variation within
 * supertype). Second, the formula that I gave you last was not quite right in that it did not
 * take into account the denominator of the prior odds term. Given p_kh, the uncorrected
 * probability of being an epitope according to the classifier for peptide of length k and
 * HLA h, the correction is as follows:
 *
 * log odds := ln (p_kh/(1-p_kh))
 * log odds := log odds + ln( [relFreq_kh/0.25 * (1/100)] / [1 - relFreq_kh/0.25 * (1/100)] )
 * pk_corrected = exp(log odds) / (1 + exp(log odds))
 *
 * (Technical notes: In training, we are assuming a prior of 1/100 for each hla and k.
 * In the data, the prior over hla is not uniform (e.g., there is lots of A02), but we think
 * this is sampling bias. That is, we think the prior on being an epitope is roughly
 * uniform for each hla. But, the data is fairly unbiased wrt prior on epitope of length
 * k reacting, given HLA. That is, biologists were looking at particular HLAs, but they
 * then found the optimal length for the epitope, giving an unbiased view of which lengths
 * react with which HLAs. Thus, for every HLA, we should correct the prior as a function
 * of length. We used to correct by supertype, but I'm seeing too much variation within
 * a given supertype. To help with smoothing, I'm using a Dirichlet(1,1,1,1) prior.
 * Dividing each relFreq by 0.25 in the above formula guarantees that the overall prior is
 * still 1/100.)
 *
 *
 * From: [Microsoft Research]
 * Sent: Thursday, July 27, 2006 4:25 PM
 *
 *
 * As we discussed, I would like to write out the weight of evidence for the epitope rather
 * than its posterior probability. This is logOdds minus the prior (which is implicitly 1/100
 * in our training data).
 *
 * The formula for weight of evidence is (assuming 4 values of K, and 99 negatives per positive)
 *
 * priorLogOddsOfThisLengthAndHla = LogOdds((relFreq/.25) * .01);
 * originalLogOdds = LogOdds(originalP);
 * correctedLogOdds = originalLogOdds + priorLogOddsOfThisLengthAndHla;
 * weightofEvidence = correctedLogOdds - LogOdds(0.01);
 *
 */
/// <summary>
/// Populates KToHlaToPriorLogOdds (prior log odds per peptide length k and HLA, per the
/// correction described above), fills _hlaSet with every HLA seen, and builds
/// _supertypeMap. Ends by asserting every k has an entry for every HLA.
/// </summary>
private void CreateKToHlaToPriorLogOdds()
{
    KToHlaToPriorLogOdds = new Dictionary<int, Dictionary<Hla, double>>();
    _hlaSet = new Set<Hla>();
    // NOTE(review): this local is never used in this method - candidate for removal if
    // GetFactory has no side effects; kept to preserve behavior exactly.
    HlaFactory hlaFactory = HlaFactory.GetFactory("MixedWithB15AndA68");
    _supertypeMap = new Dictionary<string, Set<Hla>>();

    Dictionary<Hla, Dictionary<int, int>> hlaToLengthToLengthToSmoothedCount = CreateHlaToLengthToLengthToSmoothedCount();
    foreach (KeyValuePair<Hla, Dictionary<int, int>> hlaAndCounts in hlaToLengthToLengthToSmoothedCount)
    {
        _hlaSet.AddNewOrOld(hlaAndCounts.Key);
        int smoothedTotal = ComputeSmoothedTotal(hlaAndCounts.Value);
        // One prior entry for each supported peptide length.
        for (int length = (int)MerLength.firstLength; length <= (int)MerLength.lastLength; ++length)
        {
            AddToHlaToPriorLogOdds(hlaAndCounts.Key, hlaAndCounts.Value, smoothedTotal, length);
        }
        AddToSupertypeMap(hlaAndCounts.Key);
    }
    AssertThatEveryKHasEveryHla();
}
/// <summary>
/// Predicts the probability that the peptide reacts for at least one HLA of a patient
/// (a noisy-OR over the patient's HLA columns), averaged over all patients in the table.
/// </summary>
/// <param name="patientTable">One row per patient; each column name's first letter plus the
/// cell value forms an HLA name (e.g. "A" + "0201").</param>
/// <param name="nec">Peptide with flanks; must match the sample NEC's region lengths.</param>
/// <param name="modelOnly">Passed through to the single-HLA Predict overload.</param>
/// <returns>Mean noisy-OR reaction probability across patients.</returns>
public double Predict(List<Dictionary<string, string>> patientTable, NEC nec, bool modelOnly)
{
    double sumOfNoisyOrs = 0.0;
    foreach (Dictionary<string, string> patientRow in patientTable)
    {
        // Probability that none of this patient's HLAs react.
        double probNoneReact = 1.0;
        foreach (KeyValuePair<string, string> columnAndValue in patientRow)
        {
            Hla hla = HlaFactory.GetGroundInstance(columnAndValue.Key.Substring(0, 1) + columnAndValue.Value);
            Debug.Assert(nec.N.Length == SampleNEC.N.Length && nec.E.Length == SampleNEC.E.Length && nec.C.Length == SampleNEC.C.Length); // real assert
            string sourceIgnore;
            double reactProbability = Predict(nec, hla, modelOnly, out sourceIgnore);
            probNoneReact *= 1.0 - reactProbability;
        }
        sumOfNoisyOrs += 1.0 - probNoneReact;
    }
    // Average over patients. NOTE(review): an empty patientTable yields 0/0 = NaN here,
    // same as the original - confirm callers never pass an empty table.
    return (sumOfNoisyOrs / (double)patientTable.Count);
}
//internal EpitopeLearningData[] Split(int cCrossValPart, Random aRandom)
//{
//    List<KeyValuePair<MerAndHlaToLength, bool>> shuffleList = new List<KeyValuePair<MerAndHlaToLength, bool>>();
//    foreach (KeyValuePair<Pair<string, Hla>, bool> merAndHlaToLengthWithLabel in this)
//    {
//        shuffleList.Add(merAndHlaToLengthWithLabel);
//        int iRandomPos = aRandom.Next(shuffleList.Count);
//        shuffleList[shuffleList.Count - 1] = shuffleList[iRandomPos];
//        shuffleList[iRandomPos] = merAndHlaToLengthWithLabel;
//    }
//    EpitopeLearningData[] rgrg = new EpitopeLearningData[cCrossValPart];
//    for (int irgrg = 0; irgrg < rgrg.Length; ++irgrg)
//    {
//        rgrg[irgrg] = new EpitopeLearningData(string.Format("{0}{1}", Name, irgrg));
//    }
//    for (int iShuffleList = 0; iShuffleList < shuffleList.Count; ++iShuffleList)
//    {
//        KeyValuePair<MerAndHlaToLength, bool> merAndHlaToLengthWithLabel = shuffleList[iShuffleList];
//        int iSet = iShuffleList * cCrossValPart / shuffleList.Count;
//        rgrg[iSet].Add(merAndHlaToLengthWithLabel.Key, merAndHlaToLengthWithLabel.Value);
//    }
//    return rgrg;
//}

/// <summary>
/// Loads the labeled epitope examples of length <paramref name="eLength"/> whose "source"
/// column overlaps the requested dataset, from a tab-delimited file or resource with
/// columns peptide, hla, source, label.
/// </summary>
/// <param name="hlaFactory">Factory used to resolve each row's "hla" string.</param>
/// <param name="eLength">Only peptides of exactly this length are kept.</param>
/// <param name="datasetName">Dataset selector, expanded via CreateSourceSet (e.g. "LANL").</param>
/// <param name="fileOrResourceName">Input table name passed to Predictor.TabFileTable.</param>
/// <returns>Learning data keyed by (peptide, Hla); later duplicate rows overwrite earlier ones.</returns>
public static EpitopeLearningData GetDbWhole(HlaFactory hlaFactory, int eLength, string datasetName, string fileOrResourceName)
{
    Set<string> wantedSet = CreateSourceSet(datasetName);
    EpitopeLearningData rg = new EpitopeLearningData(datasetName);
    //SpecialFunctions.CheckCondition(hlaResolution.Equals(HlaResolution.ABMixed));
    foreach (Dictionary<string, string> row in Predictor.TabFileTable(fileOrResourceName, "peptide hla source label", false))
    {
        string peptide = row["peptide"];
        SpecialFunctions.CheckCondition(Biology.GetInstance().LegalPeptide(peptide), string.Format("Peptide, '{0}', contains illegal char.", peptide));
        if (peptide.Length != eLength) //!!!const
        {
            continue;
        }
        string source = row["source"];
        Set<string> providedSet = CreateSourceSet(source);
        //Debug.Assert(providedSet.IsSubsetOf(Set<string>.GetInstance(new string[] { "Aplus", "LANL", "IEDB" }))); // real assert
        if (providedSet.IntersectionIsEmpty(wantedSet))
        {
            continue;
        }
        Hla hla = hlaFactory.GetGroundInstance(row["hla"]);
        //HlaToLength hlaToLength = HlaToLength.GetInstance(hla, hlaResolution);
        Pair<string, Hla> peptideAndHla = new Pair<string, Hla>(peptide, hla);
        //MerAndHlaToLength aMerAndHlaToLength = MerAndHlaToLength.GetInstance(peptide, hlaToLength, kmerDefinition);
        string label = row["label"];
        // CheckCondition throws on failure, so the old message ("...will be ignored") was
        // wrong: an unknown label aborts the load rather than being skipped. The message
        // now states the actual contract, matching ReadTable's label check.
        SpecialFunctions.CheckCondition(label == "0" || label == "1", string.Format("Expect label for epitope example {0} to be '0' or '1', not '{1}'", peptideAndHla, label));
        rg[peptideAndHla] = (label == "1");
    }
    return (rg);
}