/// <summary>
/// Splits every case's sequence by protein and writes one file per protein, each listing
/// the case ids together with the part of their sequence that belongs to that protein.
/// </summary>
/// <param name="niceName">Label embedded in every output file name; also written to the debug output.</param>
/// <param name="outputDirectory">Directory that receives the "{protein}.{niceName}.aaSeq.txt" files.</param>
public void SplitOnProtein(string niceName, string outputDirectory)
{
    Debug.WriteLine(niceName);

    Dictionary<string, Dictionary<string, string>> proteinToCaseIdToSequence =
        new Dictionary<string, Dictionary<string, string>>();

    foreach (string caseId in _caseIdToAASeq.Keys)
    {
        AASeq aaSeq = _caseIdToAASeq[caseId];
        string previousProtein = null;
        Dictionary<string, StringBuilder> proteinToSequence = new Dictionary<string, StringBuilder>();

        // Read once per case; nothing inside the loop changes it.
        int sequenceLength = (int)SequenceLengthOrNull;
        for (int aa0Pos = 0; aa0Pos < sequenceLength; ++aa0Pos)
        {
            // The protein is whatever precedes the first '@' in the position name.
            string posName = aaSeq.OriginalAA1Position(aa0Pos);
            string protein = posName.Split('@')[0];

            // NOTE(review): the builder is dereferenced right away, so this relies on the
            // project's GetValueOrDefault extension returning a usable (non-null) instance for
            // an unseen key - presumably inserting a fresh one into the dictionary. Confirm.
            StringBuilder sequence = proteinToSequence.GetValueOrDefault(protein);

            if (previousProtein != protein)
            {
                // Entering a (new) protein: its builder must still be empty, otherwise the
                // protein's positions were interleaved with another protein's.
                Helper.CheckCondition(sequence.Length == 0, "Expect proteins to be contintiguous");
                previousProtein = protein;
            }

            Set<char> strainAASet = aaSeq[aa0Pos];
            sequence.Append(AASeq.AaAsString(strainAASet));
        }

        // File this case's per-protein pieces under their protein.
        foreach (KeyValuePair<string, StringBuilder> proteinAndSequence in proteinToSequence)
        {
            Dictionary<string, string> caseIdToSequence =
                proteinToCaseIdToSequence.GetValueOrDefault(proteinAndSequence.Key);
            caseIdToSequence.Add(caseId, proteinAndSequence.Value.ToString());
        }
    }

    foreach (KeyValuePair<string, Dictionary<string, string>> proteinAndCases in proteinToCaseIdToSequence)
    {
        // Path.Combine instead of a hard-coded '\' so the path is also valid where the
        // directory separator differs or outputDirectory already ends with a separator.
        string fileName = string.Format("{0}.{1}.aaSeq.txt", proteinAndCases.Key, niceName);
        string outputFileName = Path.Combine(outputDirectory, fileName);

        using (TextWriter textWriter = File.CreateText(outputFileName))
        {
            // Header row; the original author flagged this literal (!!!const) as a candidate
            // for a named constant.
            textWriter.WriteLine("cid\taaSeq");
            foreach (KeyValuePair<string, string> caseIdAndSequence in proteinAndCases.Value)
            {
                textWriter.WriteLine(Helper.CreateTabString(caseIdAndSequence.Key, caseIdAndSequence.Value));
            }
        }
    }
}
/// <summary>
/// Walks every position of the first sequence and, for each key not seen before, records it
/// in <paramref name="seenTriple"/> and appends it to its protein's list.
/// </summary>
/// <remarks>
/// A position name is split on '@'; field 0 is the protein and field 2 is the position
/// (presumably an HXB2 coordinate, judging by the original local's name - confirm).
/// The key is Helper.CreateTabString(protein, position).
/// </remarks>
/// <param name="seenTriple">Keys recorded so far; newly encountered keys are added to it.</param>
/// <param name="proteinToTripleList">Per-protein lists that receive each newly encountered key.</param>
public void TriplesAppend(ref Set<string> seenTriple, ref Dictionary<string, List<string>> proteinToTripleList)
{
    AASeq firstSequence = GetFirstAASeq();

    for (int index = 0; index < (int)SequenceLengthOrNull; index++)
    {
        string[] fields = firstSequence.OriginalAA1Position(index).Split('@');
        string proteinName = fields[0];
        string positionField = fields[2];
        string key = Helper.CreateTabString(proteinName, positionField);

        if (seenTriple.Contains(key))
        {
            // Already recorded on an earlier position (or an earlier call).
            continue;
        }

        seenTriple.AddNew(key);
        // NOTE(review): relies on the project's GetValueOrDefault extension returning a usable
        // list for a protein that has no entry yet - confirm.
        proteinToTripleList.GetValueOrDefault(proteinName).Add(key);
    }
}
/// <summary>
/// Builds a new AASeq from <paramref name="aaSeqIn"/> that omits positions equal to Delete
/// and ends at a Stop, keeping the original position name of every position it retains.
/// </summary>
/// <remarks>
/// A Stop at the last position ends the copy and is not included. A Stop anywhere earlier
/// writes a warning to <paramref name="errorStream"/>; it then either ends the copy
/// (<paramref name="stopOnStop"/> true) or is copied like any other position.
/// </remarks>
/// <param name="caseId">Case id used only in the warning text.</param>
/// <param name="aaSeqIn">Sequence to copy from; its MixtureSemantics is passed to the new instance.</param>
/// <param name="stopOnStop">Whether a Stop before the last position ends the copy.</param>
/// <param name="errorStream">Receives the warning about an early Stop.</param>
/// <returns>The newly built sequence.</returns>
static public AASeq GetCompressedInstance(string caseId, AASeq aaSeqIn, bool stopOnStop, TextWriter errorStream)
{
    AASeq compressed = new AASeq(aaSeqIn.MixtureSemantics);
    compressed.Sequence = new List<Set<char>>();
    compressed._originalAA1PositionTableOrNull = new List<string>();
    Debug.Assert(compressed.Offset == 0); // real assert

    for (int position = 0; position < aaSeqIn.Count; position++)
    {
        Set<char> residues = aaSeqIn[position];
        string originalName = aaSeqIn.OriginalAA1Position(position);

        if (residues.Equals(Delete)) //!!!const
        {
            continue;
        }

        if (residues.Equals(Stop)) //!!!const
        {
            bool isLastPosition = (position == aaSeqIn.Count - 1);
            if (isLastPosition)
            {
                // A trailing Stop just terminates the sequence.
                break;
            }

            errorStream.WriteLine("Warning: The sequence for case id '{0}' contains a '*' before the last position", caseId);
            if (stopOnStop)
            {
                break;
            }
            // Otherwise fall through: the early Stop is kept in the output.
        }

        compressed.Sequence.Add(residues);
        compressed._originalAA1PositionTableOrNull.Add(originalName);
    }

    return compressed;
}
/// <summary>
/// Lazily enumerates sparse lines, one per (position, amino acid, case) combination for
/// which the case has a definite true/false value.
/// </summary>
/// <remarks>
/// Each line is Helper.CreateTabString("{posName}@{aa}", caseId, 1 or 0), where the amino
/// acids considered at a position are those returned by EveryAminoAcid for that position.
/// How a case's amino-acid set at the position maps to a value:
///   equals AASeq.Any                         -> missing (no line)
///   does not contain aa                      -> 0
///   contains aa, at most one element         -> 1
///   contains aa, more than one element       -> depends on the sequence's MixtureSemantics:
///       Pure -> 0, Uncertainty -> missing, Any -> 1
/// Yields nothing when there are no sequences.
/// </remarks>
/// <param name="keepOneValueVariables">
/// When false, positions where EveryAminoAcid returns a single amino acid are skipped, and a
/// variable is emitted only if both true and false occur among its cases.
/// </param>
/// <returns>The sparse lines, produced on demand.</returns>
public IEnumerable<string> SparseLineEnumeration(bool keepOneValueVariables)
{
    if (_caseIdToAASeq.Count == 0)
    {
        Debug.Assert(SequenceLengthOrNull == null); // real assert
        yield break;
    }

    Helper.CheckCondition(SequenceLengthOrNull != null, "This converter to sparse assumes all sequences have the same length");

    for (int position = 0; position < (int)SequenceLengthOrNull; position++)
    {
        Set<char> aminoAcidsHere = EveryAminoAcid(position);
        bool worthVisiting = keepOneValueVariables || aminoAcidsHere.Count != 1;
        if (!worthVisiting)
        {
            continue;
        }

        // Established by the first sequence that covers this position; every later
        // sequence (and every later amino acid) must report the same name.
        string positionName = null;

        foreach (char aminoAcid in aminoAcidsHere)
        {
            Set<bool> distinctValues = Set<bool>.GetInstance();
            Dictionary<string, bool> valueByCase = new Dictionary<string, bool>();

            foreach (string caseId in _caseIdToAASeq.Keys)
            {
                AASeq sequence = _caseIdToAASeq[caseId];
                if (position >= sequence.Count)
                {
                    continue;
                }

                Set<char> residues = sequence[position];

                if (positionName == null)
                {
                    positionName = sequence.OriginalAA1Position(position);
                }
                else
                {
                    Helper.CheckCondition(positionName == sequence.OriginalAA1Position(position));
                }

                // null means "missing": the case contributes no line for this variable.
                bool? value = null;
                if (residues.Equals(AASeq.Any))
                {
                    // missing
                }
                else if (!residues.Contains(aminoAcid))
                {
                    value = false;
                }
                else if (residues.Count <= 1)
                {
                    value = true;
                }
                else
                {
                    // Mixture that includes this amino acid.
                    switch (sequence.MixtureSemantics)
                    {
                        case MixtureSemantics.Pure:
                            value = false;
                            break;
                        case MixtureSemantics.Uncertainty:
                            // missing
                            break;
                        case MixtureSemantics.Any:
                            value = true;
                            break;
                        default:
                            Helper.CheckCondition(false, "Unknown mixturesemantics " + sequence.MixtureSemantics.ToString());
                            break;
                    }
                }

                if (value.HasValue)
                {
                    valueByCase.Add(caseId, value.Value);
                    distinctValues.AddNewOrOld(value.Value);
                }
            }

            Helper.CheckCondition(positionName != null);

            if (!keepOneValueVariables && distinctValues.Count != 2)
            {
                continue;
            }

            string variableName = string.Format("{0}@{1}", positionName, aminoAcid);
            foreach (KeyValuePair<string, bool> caseAndValue in valueByCase)
            {
                yield return Helper.CreateTabString(
                    variableName, caseAndValue.Key, caseAndValue.Value ? 1 : 0);
            }
        }
    }
}