/// <summary>
/// Lazily enumerates sparse-format lines that record, for every mer returned by
/// EveryUnambiguousStopFreeMer and every protein returned by EveryProtein, whether each
/// case's compressed sequence contains that mer.
/// </summary>
/// <param name="keepOneValueVariables">
/// When false, a (protein, mer) variable is emitted only if both a true and a false value
/// were recorded for it across the cases.
/// </param>
/// <param name="merLength">Mer length handed to EveryUnambiguousStopFreeMer.</param>
/// <returns>
/// One tab string per (variable, case): the variable name "protein@mer", the case id, and 1 or 0.
/// Cases for which ContainsMer returns null are omitted.
/// </returns>
public IEnumerable<string> SparseLineMerEnumeration(bool keepOneValueVariables, int merLength)
{
    if (_caseIdToAASeq.Count == 0)
    {
        Debug.Assert(SequenceLengthOrNull == null); // real assert
        yield break;
    }

    Helper.CheckCondition(SequenceLengthOrNull != null, "This converter to sparse assumes all sequences have the same length");

    Dictionary<string, AASeq> compressedSeqByCase = RemoveDeletesAndStopsFromData(false, Console.Error);

    foreach (string mer in EveryUnambiguousStopFreeMer(merLength, compressedSeqByCase))
    {
        Regex merPattern = AASeq.CreateMerRegex(mer);

        //!!!look for similar code elsewhere
        foreach (string protein in EveryProtein())
        {
            Set<bool> observedValues = Set<bool>.GetInstance();
            Dictionary<string, bool> valueByCase = new Dictionary<string, bool>();

            foreach (KeyValuePair<string, AASeq> caseAndSeq in compressedSeqByCase)
            {
                AASeq seq = caseAndSeq.Value;
                Helper.CheckCondition(seq.MixtureSemantics == MixtureSemantics.Uncertainty, "Code does not expect Mixture semantics");

                // A null answer means "cannot tell" for this case; such cases are left out.
                bool? containsOrNull = seq.ContainsMer(mer, merPattern, protein);
                if (containsOrNull.HasValue)
                {
                    bool contains = containsOrNull.Value;
                    valueByCase.Add(caseAndSeq.Key, contains);
                    observedValues.AddNewOrOld(contains);
                }
            }

            // Skip variables that took only one value unless the caller asked to keep them.
            if (!keepOneValueVariables && observedValues.Count != 2)
            {
                continue;
            }

            string variableName = protein + "@" + mer;
            foreach (KeyValuePair<string, bool> caseAndValue in valueByCase)
            {
                yield return Helper.CreateTabString(variableName, caseAndValue.Key, caseAndValue.Value ? 1 : 0);
            }
        }
    }
}
//!!!same logic is elsewhere. Look for common heading
/// <summary>
/// Reads every record from <paramref name="datareader"/> and builds a
/// peptide -> (cid -> magnitude) table from the "peptide", "cid" and "magnitude" columns.
/// Records whose trimmed cid is empty are skipped (but still counted in the console message).
/// </summary>
/// <param name="datareader">Reader positioned before the first record.</param>
/// <param name="cidsInReactTable">Receives every non-empty cid encountered.</param>
/// <returns>The table that was filled while reading.</returns>
/// <remarks>
/// NOTE(review): this relies on SpecialFunctions.GetValueOrDefault storing a new inner
/// dictionary in the table when the peptide is not present yet - confirm against that helper.
/// Adding the same cid twice for one peptide throws from Dictionary.Add.
/// </remarks>
private static Dictionary<string, Dictionary<string, double>> LoadReactTableUnfiltered(DbDataReader datareader, out Set<string> cidsInReactTable)
{
    cidsInReactTable = Set<string>.GetInstance();
    Dictionary<string, Dictionary<string, double>> reactTable = new Dictionary<string, Dictionary<string, double>>();

    int peptideOrdinal = datareader.GetOrdinal("peptide");
    int cidOrdinal = datareader.GetOrdinal("cid");
    int magnitudeOrdinal = datareader.GetOrdinal("magnitude");

    int recordCount = 0;
    while (datareader.Read())
    {
        ++recordCount;

        string cid = datareader.GetString(cidOrdinal).Trim();
        if (cid.Length != 0)
        {
            cidsInReactTable.AddNewOrOld(cid);

            string peptide = datareader.GetString(peptideOrdinal).Trim();
            double magnitude = datareader.GetDouble(magnitudeOrdinal);

            Dictionary<string, double> magnitudeByCid = SpecialFunctions.GetValueOrDefault(reactTable, peptide);
            magnitudeByCid.Add(cid, magnitude);
        }
    }

    Console.WriteLine("{0}: number of records read: {1}", "React table", recordCount);
    return reactTable;
}
/* From [Microsoft Research]:
 *
 * - I've changed two things wrt prior corrections.  First, I'm computing relative frequencies
 *   across length per HLA rather than per supertype (there was too much variation within
 *   supertype).  Second, the formula that I gave you last was not quite right in that it did not
 *   take into account the denominator of the prior odds term.  Given p_kh, the uncorrected
 *   probability of being an epitope according to the classifier for peptide of length k and
 *   HLA h, the correction is as follows:
 *
 *     log odds := ln (p_kh/(1-p_kh))
 *     log odds := log odds + ln( [relFreq_kh/0.25 * (1/100)] / [1 - relFreq_kh/0.25 * (1/100)] )
 *     pk_corrected = exp(log odds) / (1 + exp(log odds))
 *
 *   (Technical notes: In training, we are assuming a prior of 1/100 for each hla and k.
 *   In the data, the prior over hla is not uniform (e.g., there is lots of A02), but we think
 *   this is sampling bias.  That is, we think the prior on being an epitope is roughly
 *   uniform for each hla.  But, the data is fairly unbiased wrt prior on epitope of length
 *   k reacting, given HLA.  That is, biologists were looking at particular HLAs, but they
 *   then found the optimal length for the epitope, giving an unbiased view of which lengths
 *   react with which HLAs.  Thus, for every HLA, we should correct the prior as a function
 *   of length.  We used to correct by supertype, but I'm seeing too much variation within
 *   a given supertype.  To help with smoothing, I'm using a Dirichlet(1,1,1,1) prior.
 *   Dividing each relFreq by 0.25 in the above formula guarantees that the overall prior is
 *   still 1/100.)
 *
 *
 * From: [Microsoft Research]
 * Sent: Thursday, July 27, 2006 4:25 PM
 *
 *
 * As we discussed, I would like to write out the weight of evidence for the epitope rather
 * than its posterior probability.  This is logOdds minus the prior (which is implicitly 1/100
 * in our training data).
 *
 * The formula for weight of evidence is (assuming 4 values of K, and 99 negatives per positive)
 *
 *     priorLogOddsOfThisLengthAndHla = LogOdds((relFreq/.25) * .01);
 *     originalLogOdds = LogOdds(originalP);
 *     correctedLogOdds = originalLogOdds + priorLogOddsOfThisLengthAndHla;
 *     weightofEvidence = correctedLogOdds - LogOdds(0.01);
 *
 */
/// <summary>
/// Resets KToHlaToPriorLogOdds, _hlaSet and _supertypeMap, then, for every HLA returned by
/// CreateHlaToLengthToLengthToSmoothedCount, records the HLA, calls AddToHlaToPriorLogOdds for
/// each k from MerLength.firstLength through MerLength.lastLength, and registers the HLA via
/// AddToSupertypeMap. Finishes by calling AssertThatEveryKHasEveryHla.
/// </summary>
private void CreateKToHlaToPriorLogOdds()
{
    KToHlaToPriorLogOdds = new Dictionary<int, Dictionary<Hla, double>>();
    _hlaSet = new Set<Hla>();
    // NOTE(review): this factory is never used below; the call is kept in case GetFactory has
    // side effects that later code depends on - confirm before removing.
    HlaFactory hlaFactory = HlaFactory.GetFactory("MixedWithB15AndA68");
    _supertypeMap = new Dictionary<string, Set<Hla>>();

    Dictionary<Hla, Dictionary<int, int>> smoothedCountsByHla = CreateHlaToLengthToLengthToSmoothedCount();

    const int firstK = (int)MerLength.firstLength;
    const int lastK = (int)MerLength.lastLength;

    foreach (KeyValuePair<Hla, Dictionary<int, int>> hlaAndCounts in smoothedCountsByHla)
    {
        Hla hla = hlaAndCounts.Key;
        Dictionary<int, int> lengthToSmoothedCount = hlaAndCounts.Value;

        _hlaSet.AddNewOrOld(hla);

        int smoothedTotal = ComputeSmoothedTotal(lengthToSmoothedCount);
        for (int k = firstK; k <= lastK; ++k)
        {
            AddToHlaToPriorLogOdds(hla, lengthToSmoothedCount, smoothedTotal, k);
        }

        AddToSupertypeMap(hla);
    }

    AssertThatEveryKHasEveryHla();
}
/// <summary>
/// For every unambiguous mer of length <paramref name="merLength"/> in every case's sequence,
/// counts how often each mer string occurs at each original position (as reported by
/// OriginalAA1Position(0) on the mer).
/// </summary>
/// <param name="merLength">Length passed to SubSeqEnumeration.</param>
/// <param name="textWriterForWarnings">Receives a warning whenever a mer string repeats within one case.</param>
/// <param name="caseToCompressedAASeq">Sequences to scan, keyed by case id.</param>
/// <returns>Mer string -> (original position -> count).</returns>
/// <remarks>
/// A repeated mer only triggers a warning; it is still counted.
/// NOTE(review): relies on the GetValueOrDefault extension inserting a fresh inner dictionary
/// for an unseen mer and yielding 0 for an unseen position - confirm against the helper.
/// </remarks>
private static Dictionary<string, Dictionary<string, int>> CreateMerStringToOriginalAA0PositionToCount(int merLength, TextWriter textWriterForWarnings, Dictionary<string, AASeq> caseToCompressedAASeq)
{
    Dictionary<string, Dictionary<string, int>> positionCountsByMer = new Dictionary<string, Dictionary<string, int>>();

    foreach (KeyValuePair<string, AASeq> caseAndSeq in caseToCompressedAASeq)
    {
        string caseId = caseAndSeq.Key;
        Set<string> mersSeenInThisCase = new Set<string>();

        foreach (AASeq mer in caseAndSeq.Value.SubSeqEnumeration(merLength))
        {
            if (mer.Ambiguous)
            {
                continue;
            }

            string merString = mer.ToString();
            bool isRepeat = mersSeenInThisCase.Contains(merString);
            if (isRepeat)
            {
                textWriterForWarnings.WriteLine("Warning: Mer '{0}' appears again in case '{1}'", merString, caseId);
            }
            mersSeenInThisCase.AddNewOrOld(merString);

            string originalPosition = mer.OriginalAA1Position(0);
            Dictionary<string, int> countByPosition = positionCountsByMer.GetValueOrDefault(merString);
            int previousCount = countByPosition.GetValueOrDefault(originalPosition);
            countByPosition[originalPosition] = 1 + previousCount;
        }
    }

    return positionCountsByMer;
}
/// <summary>
/// Adds <paramref name="hla"/> to the _supertypeMap entry for the supertype returned by
/// SetSupertypeAny, unless that supertype is "unknown" or "none".
/// </summary>
/// <param name="hla">The HLA to register under its supertype.</param>
private void AddToSupertypeMap(Hla hla)
{
    string supertype = SetSupertypeAny(hla, HasBlanks);

    //!!!"unknown" is a misnomer. Should be "none" or null, but don't want to change it because it is already in useful models.
    if (supertype == "unknown" || supertype == "none")
    {
        return;
    }

    Set<Hla> members = SpecialFunctions.GetValueOrDefault(_supertypeMap, supertype);
    members.AddNewOrOld(hla);
}
//internal static TrueCollection GetInstanceX(IEnumerable<string> hlaCollection, Random random)
//{
//    TrueCollection aTrueCollection = new TrueCollection();
//    foreach (string hla in hlaCollection)
//    {
//        if (random.Next(2) == 0)
//        {
//            aTrueCollection.Add(hla);
//        }
//    }
//    return aTrueCollection;
//}

/// <summary>
/// Copies every Hla enumerated from this instance into a newly created set.
/// </summary>
/// <returns>A new set holding the enumerated HLAs.</returns>
public Set<Hla> CreateHlaAssignmentAsSet()
{
    Set<Hla> result = Set<Hla>.GetInstance();
    foreach (Hla member in this)
    {
        result.AddNewOrOld(member);
    }

    return result;
}
//public bool IsUsingOriginalPositions()
//{
//    return _originalAA0PositionTableOrNull == null;
//}

/// <summary>
/// Collects the distinct protein names found in _originalAA1PositionTableOrNull, where the
/// protein name of an entry is the text before its first '@' (or the whole entry when it
/// contains no '@').
/// </summary>
/// <returns>The set of protein names.</returns>
/// <remarks>
/// NOTE(review): the field name suggests it may be null, in which case the enumeration below
/// throws - confirm callers only use this when the table is present.
/// </remarks>
internal IEnumerable<string> EveryProtein()
{
    Set<string> proteins = Set<string>.GetInstance();

    foreach (string position in _originalAA1PositionTableOrNull)
    {
        int separatorIndex = position.IndexOf('@');
        string protein = separatorIndex < 0 ? position : position.Substring(0, separatorIndex);
        proteins.AddNewOrOld(protein);
    }

    return proteins;
}
/// <summary>
/// Rebuilds PatientList from the tab file named by PatientFileName: each row's "pid" becomes
/// a key, and the values of its a1, a2, b1, b2, c1 and c2 columns are converted with the
/// "noConstraint" HLA factory's GetGroundInstance and gathered into a set.
/// </summary>
/// <remarks>A patient id that appears on more than one row throws from Dictionary.Add.</remarks>
private void ReadPatientTable()
{
    Qmrr.HlaFactory hlaFactory = Qmrr.HlaFactory.GetFactory("noConstraint");
    PatientList = new Dictionary<string, Set<Hla>>();

    //!!!const - the column names are spelled out both here and in the header string below
    string[] hlaColumnNames = new string[] { "a1", "a2", "b1", "b2", "c1", "c2" };

    foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(PatientFileName, "pid a1 a2 b1 b2 c1 c2", false)) //!!!const
    {
        string pid = row["pid"];

        Set<Hla> patientHlas = new Set<Hla>();
        foreach (string columnName in hlaColumnNames)
        {
            Hla hla = hlaFactory.GetGroundInstance(row[columnName]);
            patientHlas.AddNewOrOld(hla);
        }

        PatientList.Add(pid, patientHlas);
    }
}
/// <summary>
/// Sets SwitchableHlasOfRespondingPatients to the distinct HLAs of every patient that is a
/// key of PatientToAnyReaction and also has an entry in PatientList.
/// </summary>
private void CreateSwitchableHlasWithRespondingPatients()
{
    Set<Hla> distinctHlas = Set<Hla>.GetInstance();

    foreach (string patient in PatientToAnyReaction.Keys)
    {
        // Patients without HLA data contribute nothing.
        if (!PatientList.ContainsKey(patient))
        {
            continue;
        }

        foreach (Hla hla in PatientList[patient])
        {
            if (distinctHlas.Contains(hla))
            {
                continue;
            }

            distinctHlas.AddNewOrOld(hla);
        }
    }

    SwitchableHlasOfRespondingPatients = new List<Hla>(distinctHlas);
}
/// <summary>
/// For each HLA in <paramref name="hlaList"/>, gathers the non-null HasHla answers of every
/// row per patient, and records the answer for a patient only when all of that patient's rows
/// agree. Patients with conflicting answers, or with no non-null answer, get no entry for
/// that HLA ("don't know").
/// </summary>
/// <param name="hlaResolution">Forwarded to HasHla.</param>
/// <param name="expandedTable">Rows to examine; each must have a "patient" column.</param>
/// <param name="header">Not used by this method.</param>
/// <param name="hlaList">The HLAs to evaluate.</param>
/// <returns>Patient -> (HLA -> has it or not) for the unambiguous combinations only.</returns>
/// <remarks>
/// When HasHla returns null for a row, the row must either have no "weight" column or a
/// weight that parses to 1; this is enforced through SpecialFunctions.CheckCondition.
/// </remarks>
public static Dictionary<string, Dictionary<Hla, bool>> CreatePatientToHlaToYesNoDontKnow(HlaResolution hlaResolution, List<Dictionary<string, string>> expandedTable, string header, IEnumerable<Hla> hlaList)
{
    Dictionary<string, Dictionary<Hla, bool>> answersByPatient = new Dictionary<string, Dictionary<Hla, bool>>();

    foreach (Hla hla in hlaList)
    {
        // Pass 1: collect the distinct answers seen for each patient.
        Dictionary<string, Set<bool>> observedByPatient = new Dictionary<string, Set<bool>>();
        foreach (Dictionary<string, string> row in expandedTable)
        {
            bool? hasHlaOrNull = HasHla(hla, row, hlaResolution);
            if (!hasHlaOrNull.HasValue)
            {
                SpecialFunctions.CheckCondition(!row.ContainsKey("weight") || double.Parse(row["weight"]) == 1);
                continue;
            }

            string patient = row["patient"];
            Set<bool> observed = SpecialFunctions.GetValueOrDefault(observedByPatient, patient);
            observed.AddNewOrOld(hasHlaOrNull.Value);
        }

        // Pass 2: keep only the patients whose answers were unanimous.
        foreach (KeyValuePair<string, Set<bool>> patientAndObserved in observedByPatient)
        {
            Set<bool> observed = patientAndObserved.Value;
            if (observed.Count != 1)
            {
                //Debug.WriteLine(string.Format("For patient {0} and hla {1}, skipping because of ambiguous data", patient, hla));
                continue;
            }

            // The set holds exactly one element; the loop is just a way to read it.
            foreach (bool hasHla in observed)
            {
                Dictionary<Hla, bool> answerByHla = SpecialFunctions.GetValueOrDefault(answersByPatient, patientAndObserved.Key);
                answerByHla.Add(hla, hasHla);
            }
        }
    }

    return answersByPatient;
}
//!!!this could be made faster by keeping track of patients with no abstract hlas
/// <summary>
/// Returns the subset of <paramref name="pidToHlaSetAll"/> whose patients do not carry an
/// abstract HLA that generalizes one of the possible causes. The possible causes are the
/// union of <paramref name="bestHlaSetSoFar"/> and <paramref name="knownHlaSet"/> plus
/// <paramref name="hla"/>.
/// </summary>
/// <param name="pidToHlaSetAll">Every patient's HLA set, keyed by patient id.</param>
/// <param name="bestHlaSetSoFar">HLAs selected so far.</param>
/// <param name="hla">The additional candidate HLA.</param>
/// <param name="knownHlaSet">HLAs already known.</param>
/// <returns>A new dictionary; the HLA sets themselves are shared with the input, not copied.</returns>
private Dictionary<string, Set<Hla>> CreatePidToHlaSetCustom(
    Dictionary<string, Set<Hla>> pidToHlaSetAll,
    Set<Hla> bestHlaSetSoFar,
    Hla hla,
    Set<Hla> knownHlaSet
    //out Dictionary<string, Set<Hla>> pidToHlaSetCustom,
    //out Dictionary<string, Dictionary<string, double>> reactTableCustom
    )
{
    Set<Hla> possibleCauses = bestHlaSetSoFar.Union(knownHlaSet);
    possibleCauses.AddNewOrOld(hla);

#if DEBUG
    foreach (Hla candidateCause in possibleCauses)
    {
        Debug.Assert(candidateCause.IsGround); // real assert
    }
#endif

    Dictionary<string, Set<Hla>> keptPatients = new Dictionary<string, Set<Hla>>();
    //reactTableCustom = new Dictionary<string, Dictionary<string, double>>();

    foreach (KeyValuePair<string, Set<Hla>> pidAndHlas in pidToHlaSetAll)
    {
        Set<Hla> patientHlaSet = pidAndHlas.Value;

        //bestSoFar/known   Hla     PidContains   ExcludePid?
        //B23               B25     B15??         No
        //B23               B1511   B15??         Yes
        bool exclude = ThisPatientContainsAnAbstractHlaThatGeneralizesAPossibleCause(patientHlaSet, possibleCauses);
        if (exclude)
        {
            //Debug.WriteLine(SpecialFunctions.CreateTabString(patientHlaSet, possibleCauses));
            continue;
        }

        keptPatients.Add(pidAndHlas.Key, patientHlaSet);
        // reactTableCustom.Add(pid, ReactTableUnfiltered[pid]);
    }

    return keptPatients;
}
/// <summary>
/// Rebuilds PeptideToFitUniverse from the file named <paramref name="dataset"/> +
/// "supertypefit.txt". Each line must hold exactly two tab-separated fields: a peptide and an
/// HLA name, the latter converted with the "noConstraint" factory's GetGroundInstance.
/// </summary>
/// <param name="dataset">Prefix prepended to "supertypefit.txt" to form the file name.</param>
internal void SetPeptideToFitUniverse(string dataset)
{
    Qmrr.HlaFactory hlaFactory = Qmrr.HlaFactory.GetFactory("noConstraint");
    PeptideToFitUniverse = new Dictionary<string, Set<Hla>>();

    string filename = dataset + "supertypefit.txt";

    //!!!would be nice to read as a tab table, to remove redundant lines, to check that HLAs are of the right form
    using (StreamReader reader = File.OpenText(filename))
    {
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string[] fields = line.Split('\t');
            SpecialFunctions.CheckCondition(fields.Length == 2);

            string peptide = fields[0];
            Hla hla = hlaFactory.GetGroundInstance(fields[1]);

            SpecialFunctions.GetValueOrDefault(PeptideToFitUniverse, peptide).AddNewOrOld(hla);
        }
    }
}
/// <summary>
/// Selects the HLAs in HlaUniverse whose 2x2 table of (patient has the HLA) versus (patient
/// appears in ReactTableUnfiltered for <paramref name="peptide"/>) yields a
/// SpecialFunctions.FisherExactTest value at or below <paramref name="pValueCutOff"/>.
/// </summary>
/// <param name="pValueCutOff">Inclusive upper bound for a selected HLA's test value.</param>
/// <param name="pidToHlaSet">Each patient's HLA set, keyed by patient id.</param>
/// <param name="peptide">Key into ReactTableUnfiltered.</param>
/// <returns>The set of HLAs that passed the cut-off.</returns>
internal Set<Hla> CreateUnivariateHlaSet(double pValueCutOff, Dictionary<string, Set<Hla>> pidToHlaSet, string peptide)
{
    Set<Hla> selectedHlas = Set<Hla>.GetInstance();

    foreach (Hla hla in HlaUniverse)
    {
        // Indexed [hasHla, doesReact]; a new array starts out as all zeros.
        int[,] contingency = new int[2, 2];

        foreach (KeyValuePair<string, Set<Hla>> pidAndHlas in pidToHlaSet)
        {
            int hasHlaIndex = pidAndHlas.Value.Contains(hla) ? 1 : 0;
            int doesReactIndex = ReactTableUnfiltered[peptide].ContainsKey(pidAndHlas.Key) ? 1 : 0;
            contingency[hasHlaIndex, doesReactIndex] += 1;
        }

        double pValue = SpecialFunctions.FisherExactTest(contingency);
        if (pValue <= pValueCutOff)
        {
            selectedHlas.AddNewOrOld(hla);
        }
    }

    return selectedHlas;
}
/// <summary>
/// Gathers the ground HLAs of every patient that appears both in
/// <paramref name="pidToHlaSet"/> and in ReactTableUnfiltered for <paramref name="peptide"/>,
/// then subtracts KnownTable(peptide) from that set.
/// </summary>
/// <param name="pidToHlaSet">Each patient's HLA set, keyed by patient id.</param>
/// <param name="peptide">Key into ReactTableUnfiltered and argument to KnownTable.</param>
/// <returns>The result of the subtraction.</returns>
protected Set<Hla> HlaSetFromReactingPatients(Dictionary<string, Set<Hla>> pidToHlaSet, string peptide)
{
    Dictionary<string, double> reactValueByPid = ReactTableUnfiltered[peptide];

    Set<string> pidsWithHlaData = Set<string>.GetInstance(pidToHlaSet.Keys);
    Set<string> pidsWithReaction = Set<string>.GetInstance(reactValueByPid.Keys);
    Set<string> sharedPids = pidsWithReaction.Intersection(pidsWithHlaData);

    Set<Hla> groundHlas = Set<Hla>.GetInstance();
    foreach (string pid in sharedPids)
    {
        foreach (Hla hla in pidToHlaSet[pid])
        {
            // Only ground HLAs are collected.
            if (!hla.IsGround)
            {
                continue;
            }

            groundHlas.AddNewOrOld(hla);
        }
    }

    return groundHlas.Subtract(KnownTable(peptide));
}
/// <summary>
/// Reads every file in the current directory matching <paramref name="inputFilePattern"/>
/// as a tab table and sorts the accepted rows into two collections: rows whose null index is
/// -1 go to <paramref name="realRowCollectionToSort"/>; for all other rows only the p-value
/// is appended to <paramref name="nullValueCollectionToBeSorted"/>. Only rows with a p-value
/// at or below <paramref name="maxPValue"/> are collected.
/// </summary>
/// <param name="inputFilePattern">Search pattern passed to Directory.GetFiles.</param>
/// <param name="keepTest">A row is processed only if this test accepts it.</param>
/// <param name="maxPValue">Inclusive upper bound on collected p-values.</param>
/// <param name="auditRowIndexValues">Forwarded to RowIndexTabulator.GetInstance.</param>
/// <param name="realRowCollectionToSort">Receives the rows whose null index is -1.</param>
/// <param name="nullValueCollectionToBeSorted">Receives the p-values of the other rows.</param>
/// <param name="headerSoFar">
/// Set to the first file's header when null on entry; a later file whose header differs
/// only produces a console warning.
/// </param>
/// <returns>Every null index seen on a processed row, regardless of its p-value.</returns>
private static Set<int> CreateTabulateReportInternal(
    string inputFilePattern,
    KeepTest<Dictionary<string, string>> keepTest,
    double maxPValue,
    bool auditRowIndexValues,
    ref List<Dictionary<string, string>> realRowCollectionToSort,
    ref List<double> nullValueCollectionToBeSorted,
    ref string headerSoFar)
{
    Set<int> nullIndexSet = Set<int>.GetInstance();

    //!!!very similar code elsewhere
    RowIndexTabulator rowIndexTabulator = RowIndexTabulator.GetInstance(auditRowIndexValues);
    //RangeCollection unfilteredRowIndexRangeCollection = RangeCollection.GetInstance();

    string[] fileNames = Directory.GetFiles(Directory.GetCurrentDirectory(), inputFilePattern);
    foreach (string fileName in fileNames)
    {
        Debug.WriteLine(fileName);

        string headerOnFile;
        bool headerChecked = false;
        foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(fileName, /*includeWholeLine*/ true, out headerOnFile))
        {
            // The header comparison runs once per file, on its first row.
            if (!headerChecked)
            {
                headerChecked = true;
                if (headerSoFar == null)
                {
                    headerSoFar = headerOnFile;
                }
                else if (headerSoFar != headerOnFile)
                {
                    Console.WriteLine("Warning: The header for file {0} is different from the 1st file read in", fileName);
                }
            }

            // The tabulator must see the row first; the keep test runs only if it accepted the row.
            if (!rowIndexTabulator.TryAdd(row, fileName) || !keepTest.Test(row))
            {
                continue;
            }

            //int unfilteredRowIndex = ReadUnfilteredRowIndexButIfMissingUseRowIndex(row, rowIndex);
            //unfilteredRowIndexRangeCollection.Add(unfilteredRowIndex);

            SpecialFunctions.CheckCondition(row.ContainsKey(NullIndexColumnName), string.Format(@"When tabulating a ""{0}"" column is required. (File ""{1}"")", NullIndexColumnName, fileName));
            int nullIndex = int.Parse(row[NullIndexColumnName]);
            nullIndexSet.AddNewOrOld(nullIndex);

            double pValue = AccessPValueFromPhylotreeRow(row);
            //if (double.IsNaN(pValue))
            //{
            //    pValue = 1;
            //    row["PValue"] = "1";
            //}

            // Written as "<=" so that a NaN p-value is not collected.
            if (pValue <= maxPValue)
            {
                bool isRealRow = nullIndex == -1;
                if (isRealRow)
                {
                    realRowCollectionToSort.Add(row);
                }
                else
                {
                    nullValueCollectionToBeSorted.Add(pValue);
                }
            }
        }
    }

    rowIndexTabulator.CheckIsComplete(inputFilePattern);
    return nullIndexSet;
}
/// <summary>
/// Lazily enumerates sparse-format lines with one binary variable per (position, amino acid):
/// for every 0-based position and every amino acid returned by EveryAminoAcid for it, emits
/// a line per case saying whether that case has the amino acid at that position.
/// </summary>
/// <param name="keepOneValueVariables">
/// When false, positions with a single amino acid are skipped, and a variable is emitted only
/// if both true and false were recorded for it.
/// </param>
/// <returns>
/// One tab string per (variable, case): the variable name "position@aminoAcid", the case id,
/// and 1 or 0. Cases whose value is missing are omitted.
/// </returns>
public IEnumerable<string> SparseLineEnumeration(bool keepOneValueVariables)
{
    if (_caseIdToAASeq.Count == 0)
    {
        Debug.Assert(SequenceLengthOrNull == null); // real assert
        yield break;
    }

    Helper.CheckCondition(SequenceLengthOrNull != null, "This converter to sparse assumes all sequences have the same length");

    /*
     n1pos  aa  pid  val
     880    A   3    F
     880    A   5    F
     880    A   9    F
     880    A   13   F
     880    A   14   F
     880    A   15   T
     ...
     */
    for (int aa0Pos = 0; aa0Pos < (int)SequenceLengthOrNull; ++aa0Pos)
    {
        Set<char> aminoAcidsAtPosition = EveryAminoAcid(aa0Pos);
        if (!keepOneValueVariables && aminoAcidsAtPosition.Count == 1)
        {
            continue;
        }

        // Taken from the first sequence that covers this position; every later one must agree.
        string posName = null;

        foreach (char aa in aminoAcidsAtPosition)
        {
            Set<bool> observedValues = Set<bool>.GetInstance();
            Dictionary<string, bool> valueByCase = new Dictionary<string, bool>();

            foreach (KeyValuePair<string, AASeq> caseAndSeq in _caseIdToAASeq)
            {
                AASeq aaSeq = caseAndSeq.Value;
                if (aa0Pos >= aaSeq.Count)
                {
                    continue;
                }

                //Helper.CheckCondition(aaSeq.IsUsingOriginalPositions(), "This converter to sparse assumes all sequences are using their original positions");
                Set<char> strainAASet = aaSeq[aa0Pos];

                if (posName == null)
                {
                    posName = aaSeq.OriginalAA1Position(aa0Pos);
                    //if (posName.Contains("68.3B"))
                    //{
                    //    Console.WriteLine("Found it first");
                    //}
                }
                else
                {
                    Helper.CheckCondition(posName == aaSeq.OriginalAA1Position(aa0Pos));
                }

                // missing: e.g. A/Any or A/AB
                // 1: e.g. A/A
                // 0: e.g. A/B or A/BCD
                bool? valueOrNull = null; // null means "missing": nothing is recorded for the case
                if (strainAASet.Equals(AASeq.Any))
                {
                    // missing
                }
                else if (!strainAASet.Contains(aa))
                {
                    valueOrNull = false;
                }
                else if (strainAASet.Count > 1)
                {
                    // The amino acid is one of several at this position; the outcome depends
                    // on the sequence's mixture semantics.
                    switch (aaSeq.MixtureSemantics)
                    {
                        case MixtureSemantics.Pure:
                            valueOrNull = false;
                            break;
                        case MixtureSemantics.Uncertainty:
                            // missing
                            break;
                        case MixtureSemantics.Any:
                            valueOrNull = true;
                            break;
                        default:
                            Helper.CheckCondition(false, "Unknown mixturesemantics " + aaSeq.MixtureSemantics.ToString());
                            break;
                    }
                }
                else
                {
                    valueOrNull = true;
                }

                if (valueOrNull.HasValue)
                {
                    valueByCase.Add(caseAndSeq.Key, valueOrNull.Value);
                    observedValues.AddNewOrOld(valueOrNull.Value);
                }
            }

            Helper.CheckCondition(posName != null);

            if (!keepOneValueVariables && observedValues.Count != 2)
            {
                continue;
            }

            string variableName = string.Format("{0}@{1}", posName, aa);
            //string variableName = string.Format("{1}@{0}", posName, aa);
            //if (variableName.Contains("68.3B"))
            //{
            //    Console.WriteLine("Found it first");
            //}
            foreach (KeyValuePair<string, bool> caseAndValue in valueByCase)
            {
                yield return Helper.CreateTabString(variableName, caseAndValue.Key, caseAndValue.Value ? 1 : 0);
            }
        }
    }
}
/// <summary>
/// Lazily enumerates sparse-format lines with one binary variable per (amino acid, position):
/// for every 0-based position and every amino acid returned by EveryAminoAcid for it, emits
/// a line per case saying whether that case has the amino acid at that position.
/// </summary>
/// <param name="keepOneValueVariables">
/// When false, positions with a single amino acid are skipped, and a variable is emitted only
/// if both true and false were recorded for it.
/// </param>
/// <returns>
/// One tab string per (variable, case): the variable name "aminoAcid@position", the case id,
/// and 1 or 0. Cases whose value is missing are omitted.
/// </returns>
public IEnumerable<string> SparseLineEnumeration(bool keepOneValueVariables)
{
    if (_caseIdToAASeq.Count == 0)
    {
        Debug.Assert(SequenceLength == null); // real assert
        yield break;
    }

    SpecialFunctions.CheckCondition(SequenceLength != null, "This converter to sparse assumes all sequences have the same length");

    /*
     n1pos  aa  pid  val
     880    A   3    F
     880    A   5    F
     880    A   9    F
     880    A   13   F
     880    A   14   F
     880    A   15   T
     ...
     */
    for (int aa0Pos = 0; aa0Pos < (int)SequenceLength; ++aa0Pos)
    {
        Set<char> aminoAcidsAtPosition = EveryAminoAcid(aa0Pos);
        if (!keepOneValueVariables && aminoAcidsAtPosition.Count == 1)
        {
            continue;
        }

        // Taken from the first sequence examined; every later one must agree.
        string posName = null;

        foreach (char aa in aminoAcidsAtPosition)
        {
            Set<bool> observedValues = Set<bool>.GetInstance();
            Dictionary<string, bool> valueByCase = new Dictionary<string, bool>();

            foreach (KeyValuePair<string, AASeq> caseAndSeq in _caseIdToAASeq)
            {
                AASeq aaSeq = caseAndSeq.Value;

                //SpecialFunctions.CheckCondition(aaSeq.IsUsingOriginalPositions(), "This converter to sparse assumes all sequences are using their original positions");
                Set<char> strainAASet = aaSeq[aa0Pos];

                if (posName == null)
                {
                    posName = aaSeq.OriginalAA1Position(aa0Pos);
                }
                else
                {
                    SpecialFunctions.CheckCondition(posName == aaSeq.OriginalAA1Position(aa0Pos));
                }

                // missing: e.g. A/Any or A/AB
                // 1: e.g. A/A
                // 0: e.g. A/B or A/BCD
                bool? valueOrNull = null; // null means "missing": nothing is recorded for the case
                if (strainAASet.Equals(AASeq.Any))
                {
                    // missing
                }
                else if (!strainAASet.Contains(aa))
                {
                    valueOrNull = false;
                }
                else if (strainAASet.Count > 1)
                {
                    // The amino acid is one of several at this position: recorded as false
                    // for a mixture sequence, otherwise treated as missing.
                    if (aaSeq.Mixture)
                    {
                        valueOrNull = false;
                    }
                }
                else
                {
                    valueOrNull = true;
                }

                if (valueOrNull.HasValue)
                {
                    valueByCase.Add(caseAndSeq.Key, valueOrNull.Value);
                    observedValues.AddNewOrOld(valueOrNull.Value);
                }
            }

            SpecialFunctions.CheckCondition(posName != null);

            if (!keepOneValueVariables && observedValues.Count != 2)
            {
                continue;
            }

            //string variableName = string.Format("{0}@{1}", posName, aa);
            string variableName = string.Format("{1}@{0}", posName, aa);
            foreach (KeyValuePair<string, bool> caseAndValue in valueByCase)
            {
                yield return SpecialFunctions.CreateTabString(variableName, caseAndValue.Key, caseAndValue.Value ? 1 : 0);
            }
        }
    }
}