Ejemplo n.º 1
0
        /// <summary>
        /// Splits every case's sequence into one sequence per protein and writes one
        /// tab-delimited text file per protein into <paramref name="outputDirectory"/>.
        /// </summary>
        /// <remarks>
        /// The protein of a position is the text before the first '@' in the name returned by
        /// OriginalAA1Position. Each file is named "{protein}.{niceName}.aaSeq.txt", starts with
        /// a header line holding the columns cid and aaSeq, and then has one line per case id.
        /// The cast of SequenceLengthOrNull to int fails if that value is null; this method does
        /// not check for it beforehand.
        /// </remarks>
        /// <param name="niceName">Label written to the debug output and embedded in every output file name.</param>
        /// <param name="outputDirectory">Directory that receives the per-protein files.</param>
        public void SplitOnProtein(string niceName, string outputDirectory)
        {
            Debug.WriteLine(niceName);
            Dictionary <string, Dictionary <string, string> > proteinToCaseIdToSequence = new Dictionary <string, Dictionary <string, string> >();

            foreach (string caseId in _caseIdToAASeq.Keys)
            {
                AASeq aaSeq = _caseIdToAASeq[caseId];

                // Read the shared length once per case instead of once per position.
                int sequenceLength = (int)SequenceLengthOrNull;

                string previousProtein = null;
                Dictionary <string, StringBuilder> proteinToSequence = new Dictionary <string, StringBuilder>();
                for (int aa0Pos = 0; aa0Pos < sequenceLength; ++aa0Pos)
                {
                    string protein = aaSeq.OriginalAA1Position(aa0Pos).Split('@')[0];

                    // NOTE(review): the code below dereferences the result immediately, so this
                    // GetValueOrDefault is presumably a project extension that creates and stores
                    // an empty value when the key is missing - confirm against its definition.
                    StringBuilder sequence = proteinToSequence.GetValueOrDefault(protein);
                    if (previousProtein != protein)
                    {
                        // A protein that reappears after a different protein would already have
                        // accumulated text, which is rejected here.
                        Helper.CheckCondition(sequence.Length == 0, "Expect proteins to be contintiguous");
                        previousProtein = protein;
                    }

                    sequence.Append(AASeq.AaAsString(aaSeq[aa0Pos]));
                }

                foreach (KeyValuePair <string, StringBuilder> proteinAndSequence in proteinToSequence)
                {
                    // NOTE(review): same insert-if-missing assumption as above.
                    Dictionary <string, string> caseIdToSequence = proteinToCaseIdToSequence.GetValueOrDefault(proteinAndSequence.Key);
                    caseIdToSequence.Add(caseId, proteinAndSequence.Value.ToString());
                }
            }

            foreach (KeyValuePair <string, Dictionary <string, string> > proteinAndCases in proteinToCaseIdToSequence)
            {
                // Path.Combine supplies the platform's directory separator and avoids doubling
                // one that outputDirectory already ends with.
                string fileName       = string.Format("{0}.{1}.aaSeq.txt", proteinAndCases.Key, niceName);
                string outputFileName = Path.Combine(outputDirectory, fileName);
                using (TextWriter textWriter = File.CreateText(outputFileName))
                {
                    textWriter.WriteLine("cid\taaSeq"); //!!!const
                    foreach (KeyValuePair <string, string> caseIdAndSequence in proteinAndCases.Value)
                    {
                        textWriter.WriteLine(Helper.CreateTabString(caseIdAndSequence.Key, caseIdAndSequence.Value));
                    }
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Walks the positions of the sequence returned by GetFirstAASeq and, for each position
        /// whose key has not been seen yet, records the key in <paramref name="seenTriple"/> and
        /// appends it to the list stored for its protein in <paramref name="proteinToTripleList"/>.
        /// </summary>
        /// <remarks>
        /// A position name is split on '@'; the key is Helper.CreateTabString applied to the first
        /// field (the protein) and the third field, so names with fewer than three fields fail on
        /// the index. Neither ref parameter is reassigned; only their contents are modified.
        /// </remarks>
        /// <param name="seenTriple">Keys already recorded; new keys are added to it.</param>
        /// <param name="proteinToTripleList">Per-protein list of keys that receives each new key.</param>
        public void TriplesAppend(ref Set <string> seenTriple, ref Dictionary <string, List <string> > proteinToTripleList)
        {
            AASeq firstSeq = GetFirstAASeq();

            for (int index = 0; index < (int)SequenceLengthOrNull; ++index)
            {
                string[] fields      = firstSeq.OriginalAA1Position(index).Split('@');
                string   proteinName = fields[0];
                string   key         = Helper.CreateTabString(proteinName, fields[2]);

                if (seenTriple.Contains(key))
                {
                    // Already recorded for an earlier position.
                    continue;
                }

                seenTriple.AddNew(key);

                // NOTE(review): the result is used without a null check, so GetValueOrDefault
                // presumably creates and stores an empty list for an unknown protein - confirm.
                List <string> keysForProtein = proteinToTripleList.GetValueOrDefault(proteinName);
                keysForProtein.Add(key);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds a new AASeq from <paramref name="aaSeqIn"/> that omits every position equal to
        /// Delete and ends at a terminating Stop, carrying each kept position's original
        /// position name along with it.
        /// </summary>
        /// <remarks>
        /// A Stop at the last position ends the copy without being copied. A Stop anywhere else
        /// writes a warning to <paramref name="errorStream"/>; the copy then ends there when
        /// <paramref name="stopOnStop"/> is true, otherwise that Stop is copied like any other
        /// position and the copy continues.
        /// </remarks>
        /// <param name="caseId">Case id quoted in the warning message.</param>
        /// <param name="aaSeqIn">Sequence to copy from; its MixtureSemantics is passed to the new instance.</param>
        /// <param name="stopOnStop">Whether a Stop before the last position ends the copy.</param>
        /// <param name="errorStream">Receives the warning about an early Stop.</param>
        /// <returns>The newly built sequence.</returns>
        static public AASeq GetCompressedInstance(string caseId, AASeq aaSeqIn, bool stopOnStop, TextWriter errorStream)
        {
            AASeq compressed = new AASeq(aaSeqIn.MixtureSemantics);

            compressed.Sequence = new List <Set <char> >();
            compressed._originalAA1PositionTableOrNull = new List <string>();
            Debug.Assert(compressed.Offset == 0); // real assert

            for (int position = 0; position < aaSeqIn.Count; ++position)
            {
                Set <char> residues       = aaSeqIn[position];
                string     sourcePosition = aaSeqIn.OriginalAA1Position(position);

                if (residues.Equals(Delete)) //!!!const
                {
                    continue;
                }

                if (residues.Equals(Stop)) //!!!const
                {
                    bool isFinalPosition = (position == aaSeqIn.Count - 1);
                    if (isFinalPosition)
                    {
                        // The terminating Stop is dropped silently.
                        break;
                    }

                    errorStream.WriteLine("Warning: The sequence for case id '{0}' contains a '*' before the last position", caseId);
                    if (stopOnStop)
                    {
                        break;
                    }
                    // Otherwise fall through: the early Stop is kept in the output.
                }

                compressed.Sequence.Add(residues);
                compressed._originalAA1PositionTableOrNull.Add(sourcePosition);
            }

            return compressed;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Lazily enumerates a sparse encoding of the sequences: for each position and each amino
        /// acid reported by EveryAminoAcid at that position, one tab-delimited line per case that
        /// has a non-missing value.
        /// </summary>
        /// <remarks>
        /// Each line is Helper.CreateTabString(variable, caseId, 1 or 0), where the variable is
        /// "{position name}@{amino acid}". For one case and one amino acid the value is:
        /// missing when the case's set equals AASeq.Any; 0 when the set does not contain the
        /// amino acid; 1 when the set is exactly that amino acid; and for a mixture containing it,
        /// 0 (Pure), missing (Uncertainty) or 1 (Any) according to the case's MixtureSemantics.
        /// Cases whose sequence is too short to cover the position are skipped.
        /// </remarks>
        /// <param name="keepOneValueVariables">
        /// When false, positions where EveryAminoAcid reports exactly one amino acid are skipped,
        /// and a variable is emitted only if both 0 and 1 occur among its values.
        /// </param>
        /// <returns>The sparse lines; an empty enumeration when there are no cases.</returns>
        public IEnumerable <string> SparseLineEnumeration(bool keepOneValueVariables)
        {
            if (_caseIdToAASeq.Count == 0)
            {
                Debug.Assert(SequenceLengthOrNull == null); // real assert
                yield break;
            }
            Helper.CheckCondition(SequenceLengthOrNull != null, "This converter to sparse assumes all sequences have the same length");

            for (int position = 0; position < (int)SequenceLengthOrNull; ++position)
            {
                Set <char> aminoAcidsHere = EveryAminoAcid(position);
                bool       positionWanted = keepOneValueVariables || aminoAcidsHere.Count != 1;
                if (!positionWanted)
                {
                    continue;
                }

                // Shared by all amino acids at this position: taken from the first case that
                // covers the position, then every later case is checked to agree with it.
                string positionName = null;
                foreach (char aminoAcid in aminoAcidsHere)
                {
                    Set <bool> distinctValues = Set <bool> .GetInstance();
                    Dictionary <string, bool> valueByCase = new Dictionary <string, bool>();

                    foreach (string caseId in _caseIdToAASeq.Keys)
                    {
                        AASeq caseSeq = _caseIdToAASeq[caseId];
                        if (position >= caseSeq.Count)
                        {
                            continue;
                        }

                        Set <char> caseAminoAcids = caseSeq[position];
                        if (positionName == null)
                        {
                            positionName = caseSeq.OriginalAA1Position(position);
                        }
                        else
                        {
                            Helper.CheckCondition(positionName == caseSeq.OriginalAA1Position(position));
                        }

                        // null means "missing": nothing is recorded for this case.
                        //   missing: e.g. A/Any, or A/AB under Uncertainty
                        //   1:       e.g. A/A
                        //   0:       e.g. A/B or A/BCD
                        bool? value;
                        if (caseAminoAcids.Equals(AASeq.Any))
                        {
                            value = null;
                        }
                        else if (!caseAminoAcids.Contains(aminoAcid))
                        {
                            value = false;
                        }
                        else if (caseAminoAcids.Count <= 1)
                        {
                            value = true;
                        }
                        else
                        {
                            // A mixture that includes the amino acid: the case's semantics decide.
                            switch (caseSeq.MixtureSemantics)
                            {
                            case MixtureSemantics.Pure:
                                value = false;
                                break;

                            case MixtureSemantics.Uncertainty:
                                value = null;
                                break;

                            case MixtureSemantics.Any:
                                value = true;
                                break;

                            default:
                                Helper.CheckCondition(false, "Unknown mixturesemantics " + caseSeq.MixtureSemantics.ToString());
                                value = null;
                                break;
                            }
                        }

                        if (value.HasValue)
                        {
                            valueByCase.Add(caseId, value.Value);
                            distinctValues.AddNewOrOld(value.Value);
                        }
                    }

                    Helper.CheckCondition(positionName != null);
                    if (!keepOneValueVariables && distinctValues.Count != 2)
                    {
                        continue;
                    }

                    // The variable name is the same for every case, so build it once.
                    string variableName = string.Format("{0}@{1}", positionName, aminoAcid);
                    foreach (KeyValuePair <string, bool> caseAndValue in valueByCase)
                    {
                        yield return(Helper.CreateTabString(
                                         variableName, caseAndValue.Key, caseAndValue.Value ? 1 : 0));
                    }
                }
            }
        }