コード例 #1
0
        /// <summary>
        /// Loads the serialized logistic model that belongs to <paramref name="modelName"/> and
        /// fills <c>EpitopeLearningDataList</c> with one entry per name in <c>NameList</c>, where
        /// every example's HLA is filed under the HLA returned by <paramref name="hlaForNormalization"/>.
        /// </summary>
        /// <param name="modelName">
        /// Model identifier. It is matched case-insensitively; only "lanliedb03062007" is recognized.
        /// The text before the first '.' also becomes part of the resource name.
        /// </param>
        /// <param name="eLength">Length of the E part of <c>SampleNEC</c> (its N and C parts are empty).</param>
        /// <param name="ncLength">Not read by this method.</param>
        /// <param name="hlaForNormalization">
        /// Maps each HLA found in the source data to the HLA it is grouped under; also stored in
        /// <c>HlaForNormalization</c> once loading succeeds.
        /// </param>
        private void LoadUpPredictor(string modelName, int eLength, int ncLength, Converter<Hla, Hla> hlaForNormalization)
        {
            string featurerizerName;

            // Lower-case with the invariant culture: a culture-sensitive ToLower() maps 'I' to a
            // dotless 'ı' under Turkish cultures, which would stop "LANLIEDB..." from matching below.
            switch (modelName.ToLowerInvariant())
            {
            //!!!would be better not to have multiple of these switch statements around - looks like a job for a Class
            case "lanliedb03062007":
                // NOTE(review): this literal looks like an e-mail-obfuscation artifact rather than
                // a real featurizer name - confirm the intended value.
                featurerizerName   = "[email protected]";
                SampleNEC          = NEC.GetInstance("", new string(' ', eLength), "");
                HlaFactory         = HlaFactory.GetFactory("MixedWithB15AndA68");
                SourceDataFileName = "lanlIedb03062007.pos.source.txt";
                NameList           = new string[] { "LANL", "IEDB" };
                break;

            default:
                SpecialFunctions.CheckCondition(false, "Don't know what featurerizer to use for the model");
                // The assignments below exist to satisfy definite assignment;
                // presumably CheckCondition(false, ...) throws - confirm.
                featurerizerName   = null;
                SourceDataFileName = null;
                NameList           = null;
                break;
            }

            string resourceName = string.Format("maxentModel{0}{1}{2}{3}.xml", modelName.Split('.')[0], SampleNEC.N.Length, SampleNEC.E.Length, SampleNEC.C.Length);

            EpitopeLearningDataList = new List<EpitopeLearningDataDupHlaOK>();
            using (StreamReader streamReader = Predictor.OpenResource(resourceName))
            {
                Logistic = (Logistic)FeatureLib.FeatureSerializer.FromXmlStreamReader(streamReader);
                // The featurizer is created exactly once, here, where it is actually consumed.
                Logistic.FeatureGenerator = FeatureLib.CreateFeaturizer(featurerizerName);
                foreach (string name in NameList)
                {
                    EpitopeLearningData epitopeLearningDataX = EpitopeLearningData.GetDbWhole(HlaFactory, SampleNEC.E.Length, name, SourceDataFileName);
                    Debug.Assert(epitopeLearningDataX.Count > 0, "Expect given data to have some data");
                    //!!!combine with previous step
                    EpitopeLearningDataDupHlaOK epitopeLearningData = new EpitopeLearningDataDupHlaOK(epitopeLearningDataX.Name);
                    foreach (KeyValuePair<Pair<string, Hla>, bool> merAndHlaAndLabel in epitopeLearningDataX)
                    {
                        Hla hlaIn  = merAndHlaAndLabel.Key.Second;
                        Hla hlaOut = hlaForNormalization(hlaIn);

                        // Nesting is mer -> normalized HLA -> original HLA -> label.
                        // Presumably GetValueOrDefault inserts an empty value when the key is
                        // missing (the returned dictionaries are mutated below) - confirm.
                        Dictionary<Hla, Dictionary<Hla, bool>> hla2ToHlaToLabel = SpecialFunctions.GetValueOrDefault(epitopeLearningData, merAndHlaAndLabel.Key.First);
                        Dictionary<Hla, bool> hlaToLabel = SpecialFunctions.GetValueOrDefault(hla2ToHlaToLabel, hlaOut);
                        hlaToLabel.Add(hlaIn, merAndHlaAndLabel.Value);
                    }

                    EpitopeLearningDataList.Add(epitopeLearningData);
                }
            }

            HlaForNormalization = hlaForNormalization;
        }
        //!!!very similar to other code
        /// <summary>
        /// Reads a tab-delimited table with the columns N, epitope, C, hla and label and returns
        /// the label ("1" is true, "0" is false) for every (NEC, Hla) example in it.
        /// </summary>
        /// <param name="hlaFactory">Converts the text of the "hla" column into an <c>Hla</c>.</param>
        /// <param name="fileName">Forwarded to <c>SpecialFunctions.TabFileTable</c>.</param>
        /// <param name="dedup">
        /// When true, a repeated example is checked for an agreeing label and then skipped.
        /// When false, a repeated example is passed to <c>Dictionary.Add</c> like any other row.
        /// </param>
        /// <returns>A new dictionary keyed by (NEC, Hla) pair.</returns>
        public static Dictionary<Pair<NEC, Hla>, bool> ReadTable(HlaFactory hlaFactory, string fileName, bool dedup)
        {
            Dictionary<Pair<NEC, Hla>, bool> labelByExample = new Dictionary<Pair<NEC, Hla>, bool>();

            foreach (Dictionary<string, string> fields in SpecialFunctions.TabFileTable(fileName, "N\tepitope\tC\thla\tlabel", false))
            {
                // Columns are read in the same order as they are validated.
                string nPart = fields["N"];
                string ePart = fields["epitope"];
                SpecialFunctions.CheckCondition(Biology.GetInstance().LegalPeptide(ePart), string.Format("Peptide, '{0}', contains illegal char.", ePart));
                string cPart = fields["C"];

                Pair<NEC, Hla> example = new Pair<NEC, Hla>(
                    NEC.GetInstance(nPart, ePart, cPart),
                    hlaFactory.GetGroundInstance(fields["hla"]));

                string labelText = fields["label"];
                SpecialFunctions.CheckCondition(labelText == "0" || labelText == "1", "Expect label to be '0' or '1'");
                bool isPositive = (labelText == "1");

                bool earlierLabel;
                if (dedup && labelByExample.TryGetValue(example, out earlierLabel))
                {
                    // Already recorded: only verify that the two occurrences agree.
                    SpecialFunctions.CheckCondition(earlierLabel == isPositive, "The example " + example.ToString() + " appears with contradictory labels.");
                }
                else
                {
                    labelByExample.Add(example, isPositive);
                }
            }

            return labelByExample;
        }
        /* From [Microsoft Research]:
         *
         *  - I’ve changed two things wrt prior corrections.  First, I’m computing relative frequencies
         *    across length per HLA rather than per supertype (there was too much variation within
         *    supertype).  Second, the formula that I gave you last was not quite right in that it did not
         *    take into account the denominator of the prior odds term.  Given p_kh, the uncorrected
         *    probability of being an epitope according to the classifier for peptide of length k and
         *    HLA h, the correction is as follows:
         *
         *  log odds  := ln (p_kh/(1-p_kh))
         *  log odds := log odds + ln(  [relFreq_kh/0.25 * (1/100)] / [1 – relFreq_kh/0.25 * (1/100)] )
         *  pk_corrected = exp(log odds) / (1 + exp(log odds))
         *
         *  (Technical notes: In training, we are assuming a prior of 1/100 for each hla and k.
         *   In the data, the prior over hla is not uniform (e.g., there is lots of A02), but we think
         *   this is sampling bias.  That is, we think the prior on being an epitope is roughly
         *   uniform for each hla.  But, the data is fairly unbiased wrt prior on epitope of length
         *   k reacting, given HLA.  That is, biologists were looking at particular HLAs, but they
         *   then found the optimal length for the epitope, giving an unbiased view of which lengths
         *   react with which HLAs.  Thus, for every HLA, we should correct the prior as a function
         *   of length.  We used to correct by supertype, but I’m seeing too much variation within
         *   a given supertype.  To help with smoothing, I’m using a Dirichlet(1,1,1,1) prior.
         *   Dividing each relFreq by 0.25 in the above formula guarantees that the overall prior is
         *   still 1/100.)
         *
         *
         *  From: [Microsoft Research]
         *  Sent: Thursday, July 27, 2006 4:25 PM
         *
         *
         *      As we discussed, I would like to write out the weight of evidence for the epitope rather
         *      than its posterior probability.  This is logOdds minus the prior (which is implicitly 1/100
         *      in our training data).
         *
         *      The formula for weight of evidence is (assuming 4 values of K, and 99 negatives per positive)
         *
         *          priorLogOddsOfThisLengthAndHla = LogOdds((relFreq/.25) * .01);
         *          originalLogOdds = LogOdds(originalP);
         *          correctedLogOdds = originalLogOdds + priorLogOddsOfThisLengthAndHla;
         *          weightofEvidence = correctedLogOdds – LogOdds(0.01);
         *
         */
        /// <summary>
        /// Resets <c>KToHlaToPriorLogOdds</c>, <c>_hlaSet</c> and <c>_supertypeMap</c>, then
        /// repopulates them from the smoothed per-HLA length counts: every HLA is added to the
        /// HLA set, gets a prior entry for each mer length from <c>MerLength.firstLength</c>
        /// through <c>MerLength.lastLength</c>, and is registered in the supertype map.
        /// </summary>
        private void CreateKToHlaToPriorLogOdds()
        {
            KToHlaToPriorLogOdds = new Dictionary<int, Dictionary<Hla, double>>();
            _hlaSet = new Set<Hla>();

            // The returned factory is not used in this method; the call is retained as-is.
            HlaFactory.GetFactory("MixedWithB15AndA68");

            _supertypeMap = new Dictionary<string, Set<Hla>>();

            int shortestK = (int)MerLength.firstLength;
            int longestK  = (int)MerLength.lastLength;

            foreach (KeyValuePair<Hla, Dictionary<int, int>> hlaAndCounts in CreateHlaToLengthToLengthToSmoothedCount())
            {
                Hla currentHla = hlaAndCounts.Key;
                Dictionary<int, int> countsByLength = hlaAndCounts.Value;

                _hlaSet.AddNewOrOld(currentHla);

                int totalForHla = ComputeSmoothedTotal(countsByLength);
                for (int k = shortestK; k <= longestK; ++k)
                {
                    AddToHlaToPriorLogOdds(currentHla, countsByLength, totalForHla, k);
                }

                AddToSupertypeMap(currentHla);
            }

            AssertThatEveryKHasEveryHla();
        }
コード例 #4
0
        /// <summary>
        /// Averages, over all rows of <paramref name="patientTable"/>, the noisy-or of the
        /// per-HLA predictions for <paramref name="nec"/>: for one row the value is
        /// 1 - product(1 - p), where p is the prediction for each HLA in that row.
        /// </summary>
        /// <param name="patientTable">
        /// One dictionary per patient. Each entry yields an HLA name made of the first character
        /// of the key followed by the value. An empty list yields NaN (0.0 divided by 0).
        /// </param>
        /// <param name="nec">Example to score; its N, E and C lengths are asserted to match <c>SampleNEC</c>.</param>
        /// <param name="modelOnly">Forwarded unchanged to the per-HLA <c>Predict</c> overload.</param>
        /// <returns>The mean per-patient noisy-or value.</returns>
        public double Predict(List<Dictionary<string, string>> patientTable, NEC nec, bool modelOnly)
        {
            double sumOverPatients = 0.0;

            foreach (Dictionary<string, string> patient in patientTable)
            {
                // Running product of (1 - p) across this patient's HLAs.
                double complementProduct = 1.0;

                foreach (KeyValuePair<string, string> entry in patient)
                {
                    string hlaName = entry.Key.Substring(0, 1) + entry.Value;
                    Hla hla = HlaFactory.GetGroundInstance(hlaName);
                    Debug.Assert(nec.N.Length == SampleNEC.N.Length && nec.E.Length == SampleNEC.E.Length && nec.C.Length == SampleNEC.C.Length); // real assert

                    string discardedSource;
                    complementProduct *= 1.0 - Predict(nec, hla, modelOnly, out discardedSource);
                }

                sumOverPatients += 1.0 - complementProduct;
            }

            return sumOverPatients / (double)patientTable.Count;
        }
コード例 #5
0
        //internal EpitopeLearningData[] Split(int cCrossValPart, Random aRandom)
        //{
        //    List<KeyValuePair<MerAndHlaToLength, bool>> shuffleList = new List<KeyValuePair<MerAndHlaToLength, bool>>();
        //    foreach (KeyValuePair<Pair<string, Hla>, bool> merAndHlaToLengthWithLabel in this)
        //    {
        //        shuffleList.Add(merAndHlaToLengthWithLabel);
        //        int iRandomPos = aRandom.Next(shuffleList.Count);
        //        shuffleList[shuffleList.Count - 1] = shuffleList[iRandomPos];
        //        shuffleList[iRandomPos] = merAndHlaToLengthWithLabel;
        //    }

        //    EpitopeLearningData[] rgrg = new EpitopeLearningData[cCrossValPart];
        //    for (int irgrg = 0; irgrg < rgrg.Length; ++irgrg)
        //    {
        //        rgrg[irgrg] = new EpitopeLearningData(string.Format("{0}{1}", Name, irgrg));
        //    }
        //    for (int iShuffleList = 0; iShuffleList < shuffleList.Count; ++iShuffleList)
        //    {
        //        KeyValuePair<MerAndHlaToLength, bool> merAndHlaToLengthWithLabel = shuffleList[iShuffleList];
        //        int iSet = iShuffleList * cCrossValPart / shuffleList.Count;
        //        rgrg[iSet].Add(merAndHlaToLengthWithLabel.Key, merAndHlaToLengthWithLabel.Value);
        //    }
        //    return rgrg;
        //}


        /// <summary>
        /// Builds an <c>EpitopeLearningData</c> named <paramref name="datasetName"/> from a
        /// tab-delimited table with the columns peptide, hla, source and label. A row is kept
        /// only when its peptide has exactly <paramref name="eLength"/> characters and its source
        /// set shares at least one member with the set created from <paramref name="datasetName"/>.
        /// </summary>
        /// <param name="hlaFactory">Converts the text of the "hla" column into an <c>Hla</c>.</param>
        /// <param name="eLength">Required peptide length; rows of any other length are skipped.</param>
        /// <param name="datasetName">Names the result and, via <c>CreateSourceSet</c>, selects the wanted sources.</param>
        /// <param name="fileOrResourceName">Forwarded to <c>Predictor.TabFileTable</c>.</param>
        /// <returns>
        /// The populated collection. When the same (peptide, hla) pair occurs on several kept
        /// rows, the label of the last such row is the one stored.
        /// </returns>
        public static EpitopeLearningData GetDbWhole(HlaFactory hlaFactory, int eLength, string datasetName, string fileOrResourceName)
        {
            Set<string>         wantedSet = CreateSourceSet(datasetName);
            EpitopeLearningData rg        = new EpitopeLearningData(datasetName);

            // The header uses explicit \t escapes (as ReadTable does) instead of raw tab
            // characters, so whitespace normalization in an editor cannot silently change it.
            foreach (Dictionary<string, string> row in Predictor.TabFileTable(fileOrResourceName, "peptide\thla\tsource\tlabel", false))
            {
                string peptide = row["peptide"];
                // Legality is checked before the length filter, so it applies to every row.
                SpecialFunctions.CheckCondition(Biology.GetInstance().LegalPeptide(peptide), string.Format("Peptide, '{0}', contains illegal char.", peptide));

                if (peptide.Length != eLength) //!!!const
                {
                    continue;
                }

                Set<string> providedSet = CreateSourceSet(row["source"]);
                if (providedSet.IntersectionIsEmpty(wantedSet))
                {
                    continue;
                }

                Hla hla = hlaFactory.GetGroundInstance(row["hla"]);
                Pair<string, Hla> peptideAndHla = new Pair<string, Hla>(peptide, hla);

                string label = row["label"];
                // NOTE(review): the message says the example "will be ignored", but nothing here
                // skips the row; presumably CheckCondition throws on failure - confirm.
                SpecialFunctions.CheckCondition(label == "0" || label == "1", string.Format("Warning: Epitope example {0} has unknown label {1} and will be ignored.", peptideAndHla, label));
                rg[peptideAndHla] = (label == "1");
            }

            return rg;
        }