Example #1
        private void LoadUpPredictor(string modelName, int eLength, int ncLength, Converter<Hla, Hla> hlaForNormalization)
        {
            //Load up the predictor

            string featurerizerName;

            switch (modelName.ToLower())
            {
            //!!!would be better not to have multiple of these switch statements around - looks like a job for a Class (see the sketch after this method)
            case "lanliedb03062007":
                featurerizerName   = "[email protected]";
                SampleNEC          = NEC.GetInstance("", new string(' ', eLength), "");
                HlaFactory         = HlaFactory.GetFactory("MixedWithB15AndA68");
                SourceDataFileName = "lanlIedb03062007.pos.source.txt";
                NameList           = new string[] { "LANL", "IEDB" };
                break;

            default:
                SpecialFunctions.CheckCondition(false, "Don't know what featurerizer to use for the model");
                featurerizerName   = null;
                SourceDataFileName = null;
                NameList           = null;
                break;
            }
            Converter<object, Set<IHashableFeature>> featurizer = FeatureLib.CreateFeaturizer(featurerizerName);

            //GeneratorType generatorType = GeneratorType.ComboAndZero6SuperType;
            //FeatureSerializer featureSerializer = PositiveNegativeExperimentCollection.GetFeatureSerializer();
            //KmerDefinition = kmerDefinition;
            //HlaResolution hlaResolution = HlaResolution.ABMixed;
            string resourceName = string.Format("maxentModel{0}{1}{2}{3}.xml", modelName.Split('.')[0], SampleNEC.N.Length, SampleNEC.E.Length, SampleNEC.C.Length);

            EpitopeLearningDataList = new List<EpitopeLearningDataDupHlaOK>();
            using (StreamReader streamReader = Predictor.OpenResource(resourceName))
            {
                Logistic = (Logistic)FeatureLib.FeatureSerializer.FromXmlStreamReader(streamReader);
                //Logistic.FeatureGenerator = EpitopeFeatureGenerator.GetInstance(KmerDefinition, generatorType, featureSerializer).GenerateFeatureSet;
                Logistic.FeatureGenerator = featurizer; // reuse the featurizer created above rather than building a second one
                foreach (string name in NameList)
                {
                    EpitopeLearningData epitopeLearningDataX = EpitopeLearningData.GetDbWhole(HlaFactory, SampleNEC.E.Length, name, SourceDataFileName);
                    Debug.Assert(epitopeLearningDataX.Count > 0, "Expect given data to have some data");
                    //!!!combine with previous step
                    EpitopeLearningDataDupHlaOK epitopeLearningData = new EpitopeLearningDataDupHlaOK(epitopeLearningDataX.Name);
                    foreach (KeyValuePair<Pair<string, Hla>, bool> merAndHlaAndLabel in epitopeLearningDataX)
                    {
                        Hla hlaIn  = merAndHlaAndLabel.Key.Second;
                        Hla hlaOut = hlaForNormalization(hlaIn);

                        Dictionary<Hla, Dictionary<Hla, bool>> hla2ToHlaToLabel = SpecialFunctions.GetValueOrDefault(epitopeLearningData, merAndHlaAndLabel.Key.First);
                        Dictionary<Hla, bool> hlaToLabel = SpecialFunctions.GetValueOrDefault(hla2ToHlaToLabel, hlaOut);
                        hlaToLabel.Add(hlaIn, merAndHlaAndLabel.Value);
                    }

                    EpitopeLearningDataList.Add(epitopeLearningData);
                }
            }

            HlaForNormalization = hlaForNormalization;
        }
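        // The "!!!" note in LoadUpPredictor suggests replacing the per-model switch with a
        // lookup table keyed by model name. A hypothetical sketch of that idea (not part of
        // the original source); values that depend on runtime arguments, such as SampleNEC
        // and HlaFactory, would still be set inside LoadUpPredictor:
        private class ModelSettings
        {
            public string FeaturerizerName;
            public string SourceDataFileName;
            public string[] NameList;
        }

        private static readonly Dictionary<string, ModelSettings> ModelSettingsTable =
            new Dictionary<string, ModelSettings>
            {
                {
                    "lanliedb03062007",
                    new ModelSettings
                    {
                        FeaturerizerName   = "[email protected]",
                        SourceDataFileName = "lanlIedb03062007.pos.source.txt",
                        NameList           = new string[] { "LANL", "IEDB" }
                    }
                }
            };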
        /* From [Microsoft Research]:
         *
         *  - I’ve changed two things wrt prior corrections.  First, I’m computing relative frequencies
         *    across length per HLA rather than per supertype (there was too much variation within
         *    supertype).  Second, the formula that I gave you last was not quite right in that it did not
         *    take into account the denominator of the prior odds term.  Given p_kh, the uncorrected
         *    probability of being an epitope according to the classifier for peptide of length k and
         *    HLA h, the correction is as follows:
         *
         *  log odds := ln(p_kh / (1 - p_kh))
         *  log odds := log odds + ln( [relFreq_kh/0.25 * (1/100)] / [1 - relFreq_kh/0.25 * (1/100)] )
         *  p_kh_corrected = exp(log odds) / (1 + exp(log odds))
         *
         *  (Technical notes: In training, we are assuming a prior of 1/100 for each hla and k.
         *   In the data, the prior over hla is not uniform (e.g., there is lots of A02), but we think
         *   this is sampling bias.  That is, we think the prior on being an epitope is roughly
         *   uniform for each hla.  But, the data is fairly unbiased wrt prior on epitope of length
         *   k reacting, given HLA.  That is, biologists were looking at particular HLAs, but they
         *   then found the optimal length for the epitope, giving an unbiased view of which lengths
         *   react with which HLAs.  Thus, for every HLA, we should correct the prior as a function
         *   of length.  We used to correct by supertype, but I’m seeing too much variation within
         *   a given supertype.  To help with smoothing, I’m using a Dirichlet(1,1,1,1) prior.
         *   Dividing each relFreq by 0.25 in the above formula guarantees that the overall prior is
         *   still 1/100.)
         *
         *
         *  From: [Microsoft Research]
         *  Sent: Thursday, July 27, 2006 4:25 PM
         *
         *
         *      As we discussed, I would like to write out the weight of evidence for the epitope rather
         *      than its posterior probability.  This is logOdds minus the prior (which is implicitly 1/100
         *      in our training data).
         *
         *      The formula for weight of evidence is (assuming 4 values of K, and 99 negatives per positive)
         *
         *          priorLogOddsOfThisLengthAndHla = LogOdds((relFreq/.25) * .01);
         *          originalLogOdds = LogOdds(originalP);
         *          correctedLogOdds = originalLogOdds + priorLogOddsOfThisLengthAndHla;
         *          weightOfEvidence = correctedLogOdds - LogOdds(0.01);
         *
         */
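        // A minimal sketch (not part of the original source) of the weight-of-evidence
        // computation described in the notes above, assuming LogOdds(p) = ln(p / (1 - p)),
        // four peptide lengths (hence the division by 0.25), and the implicit 1/100
        // training prior. originalP is the classifier's uncorrected probability p_kh;
        // relFreq is the smoothed relative frequency of length k for HLA h.
        private static double LogOdds(double p)
        {
            return Math.Log(p / (1.0 - p));
        }

        private static double WeightOfEvidence(double originalP, double relFreq)
        {
            double priorLogOddsOfThisLengthAndHla = LogOdds((relFreq / 0.25) * 0.01);
            double originalLogOdds = LogOdds(originalP);
            double correctedLogOdds = originalLogOdds + priorLogOddsOfThisLengthAndHla;
            return correctedLogOdds - LogOdds(0.01); // subtract the implicit training prior
        }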
        private void CreateKToHlaToPriorLogOdds()
        {
            KToHlaToPriorLogOdds = new Dictionary<int, Dictionary<Hla, double>>();
            _hlaSet = new Set<Hla>();
            HlaFactory hlaFactory = HlaFactory.GetFactory("MixedWithB15AndA68");

            _supertypeMap = new Dictionary<string, Set<Hla>>();

            Dictionary<Hla, Dictionary<int, int>> hlaToLengthToLengthToSmoothedCount = CreateHlaToLengthToLengthToSmoothedCount();

            foreach (Hla hla in hlaToLengthToLengthToSmoothedCount.Keys)
            {
                _hlaSet.AddNewOrOld(hla);

                Dictionary<int, int> lengthToSmoothedCount = hlaToLengthToLengthToSmoothedCount[hla];
                int smoothedTotal = ComputeSmoothedTotal(lengthToSmoothedCount);

                for (int k = (int)MerLength.firstLength; k <= (int)MerLength.lastLength; ++k)
                {
                    AddToHlaToPriorLogOdds(hla, lengthToSmoothedCount, smoothedTotal, k);
                }

                AddToSupertypeMap(hla);
            }

            AssertThatEveryKHasEveryHla();
        }
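        // Hypothetical sketch (not part of the original source) of what AddToHlaToPriorLogOdds
        // might compute, following the correction notes above: relFreq is the smoothed count of
        // length k over the smoothed total for this HLA, and dividing by 0.25 keeps the overall
        // prior at 1/100 across the four lengths.
        private void AddToHlaToPriorLogOddsSketch(Hla hla, Dictionary<int, int> lengthToSmoothedCount, int smoothedTotal, int k)
        {
            double relFreq = (double)lengthToSmoothedCount[k] / smoothedTotal;
            double priorP = (relFreq / 0.25) * 0.01;
            double priorLogOdds = Math.Log(priorP / (1.0 - priorP));

            Dictionary<Hla, double> hlaToPriorLogOdds = SpecialFunctions.GetValueOrDefault(KToHlaToPriorLogOdds, k);
            hlaToPriorLogOdds[hla] = priorLogOdds;
        }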