private void LoadUpPredictor(string modelName, int eLength, int ncLength, Converter<Hla, Hla> hlaForNormalization)
{
    // Load up the predictor
    string featurizerName;
    switch (modelName.ToLower()) //!!!would be better not to have multiple of these switch statements around - looks like a job for a Class (see the sketch after this method)
    {
        case "lanliedb03062007":
            featurizerName = "[email protected]";
            SampleNEC = NEC.GetInstance("", new string(' ', eLength), "");
            HlaFactory = HlaFactory.GetFactory("MixedWithB15AndA68");
            SourceDataFileName = "lanlIedb03062007.pos.source.txt";
            NameList = new string[] { "LANL", "IEDB" };
            break;
        default:
            SpecialFunctions.CheckCondition(false, "Don't know what featurizer to use for the model");
            featurizerName = null;
            SourceDataFileName = null;
            NameList = null;
            break;
    }

    Converter<object, Set<IHashableFeature>> featurizer = FeatureLib.CreateFeaturizer(featurizerName);
    //GeneratorType generatorType = GeneratorType.ComboAndZero6SuperType;
    //FeatureSerializer featureSerializer = PositiveNegativeExperimentCollection.GetFeatureSerializer();
    //KmerDefinition = kmerDefinition;
    //HlaResolution hlaResolution = HlaResolution.ABMixed;

    string resourceName = string.Format("maxentModel{0}{1}{2}{3}.xml",
        modelName.Split('.')[0], SampleNEC.N.Length, SampleNEC.E.Length, SampleNEC.C.Length);

    EpitopeLearningDataList = new List<EpitopeLearningDataDupHlaOK>();
    using (StreamReader streamReader = Predictor.OpenResource(resourceName))
    {
        Logistic = (Logistic)FeatureLib.FeatureSerializer.FromXmlStreamReader(streamReader);
        //Logistic.FeatureGenerator = EpitopeFeatureGenerator.GetInstance(KmerDefinition, generatorType, featureSerializer).GenerateFeatureSet;
        Logistic.FeatureGenerator = featurizer; // reuse the featurizer created above rather than creating an identical second one

        foreach (string name in NameList)
        {
            EpitopeLearningData epitopeLearningDataX = EpitopeLearningData.GetDbWhole(HlaFactory, SampleNEC.E.Length, name, SourceDataFileName);
            Debug.Assert(epitopeLearningDataX.Count > 0, "Expect given data to have some data");

            //!!!combine with previous step
            EpitopeLearningDataDupHlaOK epitopeLearningData = new EpitopeLearningDataDupHlaOK(epitopeLearningDataX.Name);
            foreach (KeyValuePair<Pair<string, Hla>, bool> merAndHlaAndLabel in epitopeLearningDataX)
            {
                Hla hlaIn = merAndHlaAndLabel.Key.Second;
                Hla hlaOut = hlaForNormalization(hlaIn);
                Dictionary<Hla, Dictionary<Hla, bool>> hla2ToHlaToLabel = SpecialFunctions.GetValueOrDefault(epitopeLearningData, merAndHlaAndLabel.Key.First);
                Dictionary<Hla, bool> hlaToLabel = SpecialFunctions.GetValueOrDefault(hla2ToHlaToLabel, hlaOut);
                hlaToLabel.Add(hlaIn, merAndHlaAndLabel.Value);
            }
            EpitopeLearningDataList.Add(epitopeLearningData);
        }
    }
    HlaForNormalization = hlaForNormalization;
}
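// Hedged sketch, not from the original source: the "!!!" note in LoadUpPredictor asks for a
// class to replace the repeated per-model switch statements. One minimal approach is to keep
// the per-model settings in a small config object keyed by the lower-cased model name.
// ModelConfig, ModelConfigTable, and CreateModelConfigTable are hypothetical names introduced
// here for illustration only.
private class ModelConfig
{
    public readonly string FeaturizerName;
    public readonly string SourceDataFileName;
    public readonly string[] NameList;

    public ModelConfig(string featurizerName, string sourceDataFileName, string[] nameList)
    {
        FeaturizerName = featurizerName;
        SourceDataFileName = sourceDataFileName;
        NameList = nameList;
    }
}

private static readonly Dictionary<string, ModelConfig> ModelConfigTable = CreateModelConfigTable();

private static Dictionary<string, ModelConfig> CreateModelConfigTable()
{
    Dictionary<string, ModelConfig> table = new Dictionary<string, ModelConfig>();
    table.Add("lanliedb03062007",
        new ModelConfig(
            "[email protected]",
            "lanlIedb03062007.pos.source.txt",
            new string[] { "LANL", "IEDB" }));
    return table;
}
// With this table, LoadUpPredictor (and any sibling method with the same switch) would look the
// ModelConfig up once and call SpecialFunctions.CheckCondition when the model name is unknown.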
/* From [Microsoft Research]:
 *
 * - I've changed two things wrt prior corrections. First, I'm computing relative frequencies
 *   across length per HLA rather than per supertype (there was too much variation within
 *   supertype). Second, the formula that I gave you last was not quite right in that it did not
 *   take into account the denominator of the prior odds term. Given p_kh, the uncorrected
 *   probability of being an epitope according to the classifier for a peptide of length k and
 *   HLA h, the correction is as follows:
 *
 *       logOdds := ln(p_kh / (1 - p_kh))
 *       logOdds := logOdds + ln( [relFreq_kh/0.25 * (1/100)] / [1 - relFreq_kh/0.25 * (1/100)] )
 *       p_corrected = exp(logOdds) / (1 + exp(logOdds))
 *
 * (Technical notes: In training, we are assuming a prior of 1/100 for each hla and k.
 * In the data, the prior over hla is not uniform (e.g., there is lots of A02), but we think
 * this is sampling bias. That is, we think the prior on being an epitope is roughly
 * uniform for each hla. But the data is fairly unbiased wrt the prior on an epitope of length
 * k reacting, given HLA. That is, biologists were looking at particular HLAs, but they
 * then found the optimal length for the epitope, giving an unbiased view of which lengths
 * react with which HLAs. Thus, for every HLA, we should correct the prior as a function
 * of length. We used to correct by supertype, but I'm seeing too much variation within
 * a given supertype. To help with smoothing, I'm using a Dirichlet(1,1,1,1) prior.
 * Dividing each relFreq by 0.25 in the above formula guarantees that the overall prior is
 * still 1/100.)
 *
 *
 * From: [Microsoft Research]
 * Sent: Thursday, July 27, 2006 4:25 PM
 *
 * As we discussed, I would like to write out the weight of evidence for the epitope rather
 * than its posterior probability. This is the log odds minus the prior log odds (the prior is
 * implicitly 1/100 in our training data).
 *
 * The formula for weight of evidence is (assuming 4 values of K, and 99 negatives per positive):
 *
 *     priorLogOddsOfThisLengthAndHla = LogOdds((relFreq / .25) * .01);
 *     originalLogOdds = LogOdds(originalP);
 *     correctedLogOdds = originalLogOdds + priorLogOddsOfThisLengthAndHla;
 *     weightOfEvidence = correctedLogOdds - LogOdds(0.01);
 *
 * (A runnable sketch of these formulas follows the method below.)
 */
private void CreateKToHlaToPriorLogOdds()
{
    KToHlaToPriorLogOdds = new Dictionary<int, Dictionary<Hla, double>>();
    _hlaSet = new Set<Hla>();
    HlaFactory hlaFactory = HlaFactory.GetFactory("MixedWithB15AndA68");
    _supertypeMap = new Dictionary<string, Set<Hla>>();
    Dictionary<Hla, Dictionary<int, int>> hlaToLengthToSmoothedCount = CreateHlaToLengthToLengthToSmoothedCount();
    foreach (Hla hla in hlaToLengthToSmoothedCount.Keys)
    {
        _hlaSet.AddNewOrOld(hla);
        Dictionary<int, int> lengthToSmoothedCount = hlaToLengthToSmoothedCount[hla];
        int smoothedTotal = ComputeSmoothedTotal(lengthToSmoothedCount);
        for (int k = (int)MerLength.firstLength; k <= (int)MerLength.lastLength; ++k)
        {
            AddToHlaToPriorLogOdds(hla, lengthToSmoothedCount, smoothedTotal, k);
        }
        AddToSupertypeMap(hla);
    }
    AssertThatEveryKHasEveryHla();
}
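// Hedged sketch, not from the original source: the correction described in the comment above,
// written out as standalone helpers. LogOdds here is ln(p / (1 - p)), as the pseudocode assumes;
// if the class already defines LogOdds elsewhere, that one should be used instead.
// CorrectForLengthPrior and its parameter names are hypothetical, introduced for illustration only.
private static double LogOdds(double p)
{
    return Math.Log(p / (1.0 - p));
}

// Given the classifier's uncorrected probability (originalP) for a peptide of length k and HLA h,
// and the smoothed relative frequency of length k for that HLA (relFreq), compute the corrected
// probability and the weight of evidence. Assumes 4 possible lengths (hence the division by 0.25)
// and the implicit 1/100 training prior from the comment above.
private static void CorrectForLengthPrior(double originalP, double relFreq,
    out double correctedP, out double weightOfEvidence)
{
    double priorLogOddsOfThisLengthAndHla = LogOdds((relFreq / 0.25) * 0.01);
    double correctedLogOdds = LogOdds(originalP) + priorLogOddsOfThisLengthAndHla;
    correctedP = Math.Exp(correctedLogOdds) / (1.0 + Math.Exp(correctedLogOdds));
    weightOfEvidence = correctedLogOdds - LogOdds(0.01); // subtract the implicit 1/100 prior
}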