Пример #1
0
        /// <summary>
        /// Counts, for every unambiguous mer of length <paramref name="merLength"/> found in the
        /// supplied sequences, how often it starts at each original position.
        /// </summary>
        /// <param name="merLength">Length of the subsequences to enumerate.</param>
        /// <param name="textWriterForWarnings">Receives one warning line each time a mer repeats within a single case.</param>
        /// <param name="caseToCompressedAASeq">Map from case id to the sequence to scan.</param>
        /// <returns>Map: mer string -> (original position of the mer's first residue -> count).</returns>
        private static Dictionary <string, Dictionary <string, int> > CreateMerStringToOriginalAA0PositionToCount(int merLength, TextWriter textWriterForWarnings, Dictionary <string, AASeq> caseToCompressedAASeq)
        {
            Dictionary <string, Dictionary <string, int> > countsByMer = new Dictionary <string, Dictionary <string, int> >();

            foreach (KeyValuePair <string, AASeq> caseIdAndAASeq in caseToCompressedAASeq)
            {
                string caseId = caseIdAndAASeq.Key;

                // Tracks the mers already met in this case so that repeats can be reported.
                Set <string> mersSeenInCase = new Set <string>();

                foreach (AASeq mer in caseIdAndAASeq.Value.SubSeqEnumeration(merLength))
                {
                    if (!mer.Ambiguous)
                    {
                        string merString = mer.ToString();
                        if (mersSeenInCase.Contains(merString))
                        {
                            textWriterForWarnings.WriteLine("Warning: Mer '{0}' appears again in case '{1}'", merString, caseId);
                        }
                        mersSeenInCase.AddNewOrOld(merString);

                        // Position of the first residue of the mer.
                        string firstPosition = mer.OriginalAA1Position(0);

                        // NOTE(review): presumably the project's GetValueOrDefault extension inserts a
                        // fresh inner dictionary when the mer is new (nothing else stores it) - confirm.
                        Dictionary <string, int> countByPosition = countsByMer.GetValueOrDefault(merString);
                        countByPosition[firstPosition] = 1 + countByPosition.GetValueOrDefault(firstPosition);
                    }
                }
            }

            return countsByMer;
        }
Пример #2
0
 /// <summary>
 /// Lazily yields every contiguous subsequence of length <paramref name="merLength"/>,
 /// starting at index 0 and advancing one position at a time.
 /// </summary>
 /// <param name="merLength">Length of each subsequence.</param>
 public IEnumerable <AASeq> SubSeqEnumeration(int merLength)
 {
     int startIndex = 0;

     // The bound is re-read on every step, exactly as a for-loop condition would be.
     while (startIndex <= Sequence.Count - merLength)
     {
         yield return SubSeqAA0Pos(startIndex, merLength);
         ++startIndex;
     }
 }
Пример #3
0
        /// <summary>
        /// Factory wrapper around the three-argument <c>AASeq</c> constructor.
        /// </summary>
        /// <param name="aaSeqAsString">Sequence text handed to the constructor.</param>
        /// <param name="mixtureSemantics">Mixture semantics handed to the constructor.</param>
        /// <param name="offset">Offset handed to the constructor.</param>
        /// <returns>The newly constructed sequence.</returns>
        static public AASeq GetInstance(string aaSeqAsString, MixtureSemantics mixtureSemantics, int offset)
        {
            return new AASeq(aaSeqAsString, mixtureSemantics, offset);
        }
Пример #4
0
 /// <summary>
 /// Attempts to extract the subsequence of <paramref name="merLength"/> items starting at the
 /// zero-based index <paramref name="aa0Pos"/>.
 /// </summary>
 /// <param name="aa0Pos">Zero-based start index.</param>
 /// <param name="merLength">Number of items requested.</param>
 /// <param name="aaSeq">The subsequence on success; null when the range does not fit.</param>
 /// <returns>True when the requested range lies inside the sequence; otherwise false.</returns>
 public bool TrySubSeqAA0Pos(int aa0Pos, int merLength, out AASeq aaSeq)
 {
     // The start must be non-negative and the end must not run past the last item.
     if (aa0Pos >= 0 && aa0Pos + merLength <= this.Sequence.Count)
     {
         aaSeq = SubSeqAA0Pos(aa0Pos, merLength);
         return true;
     }

     aaSeq = null;
     return false;
 }
Пример #5
0
        /// <summary>
        /// Creates a sequence from text together with an explicit table of original position names.
        /// </summary>
        /// <param name="aaSeqAsString">Sequence text, parsed with <c>CreateCharSetList</c>.</param>
        /// <param name="originalAA1PositionTable">
        /// Position names, one per parsed item. Must not be null. The list is stored by reference, not copied.
        /// </param>
        /// <param name="mixtureSemantics">Mixture semantics handed to the constructor.</param>
        /// <returns>The new sequence with its position table attached.</returns>
        static public AASeq GetInstance(string aaSeqAsString, List <string> originalAA1PositionTable, MixtureSemantics mixtureSemantics)
        {
            Helper.CheckCondition(originalAA1PositionTable != null, "must give a position table");

            AASeq result = new AASeq(mixtureSemantics);
            result.Sequence = result.CreateCharSetList(aaSeqAsString);

            // The table must supply exactly one name per parsed item.
            bool lengthsAgree = (result.Count == originalAA1PositionTable.Count);
            Helper.CheckCondition(lengthsAgree, "aaSeq and position table must be same length");

            result._originalAA1PositionTableOrNull = originalAA1PositionTable;
            return result;
        }
Пример #6
0
        //public void CreateSparseFile(string outputFileName, bool keepOneValueVariables)
        //{
        //	CreateSparseFile(outputFileName, keepOneValueVariables);
        //}

        /// <summary>
        /// Builds a new dictionary in which every stored sequence is replaced by the result of
        /// <c>AASeq.GetCompressedInstance</c>.
        /// </summary>
        /// <param name="stopOnStop">Forwarded to <c>AASeq.GetCompressedInstance</c>.</param>
        /// <param name="textWriter">Forwarded to <c>AASeq.GetCompressedInstance</c> as its error stream.</param>
        /// <returns>Case id to compressed sequence; the stored sequences are not replaced.</returns>
        private Dictionary <string, AASeq> RemoveDeletesAndStopsFromData(bool stopOnStop, TextWriter textWriter)
        {
            Dictionary <string, AASeq> compressedByCaseId = new Dictionary <string, AASeq>();

            foreach (KeyValuePair <string, AASeq> entry in _caseIdToAASeq)
            {
                string caseId   = entry.Key;
                AASeq  original = entry.Value;

                compressedByCaseId.Add(caseId, AASeq.GetCompressedInstance(caseId, original, stopOnStop, textWriter));
            }

            return compressedByCaseId;
        }
Пример #7
0
        /// <summary>
        /// Lazily produces tab-separated lines "protein@mer, caseId, 0|1" stating whether each case's
        /// compressed sequence contains each mer within each protein.
        /// </summary>
        /// <param name="keepOneValueVariables">
        /// When false, a protein/mer variable is emitted only if both true and false were observed for it.
        /// </param>
        /// <param name="merLength">Length of the mers to enumerate.</param>
        /// <returns>One line per (variable, case) pair whose containment result is not null.</returns>
        public IEnumerable <string> SparseLineMerEnumeration(bool keepOneValueVariables, int merLength)
        {
            if (_caseIdToAASeq.Count == 0)
            {
                Debug.Assert(SequenceLengthOrNull == null); // real assert
                yield break;
            }
            Helper.CheckCondition(SequenceLengthOrNull != null, "This converter to sparse assumes all sequences have the same length");

            // Warnings from the compression step go to standard error.
            Dictionary <string, AASeq> compressedByCase = RemoveDeletesAndStopsFromData(false, Console.Error);

            foreach (string mer in EveryUnambiguousStopFreeMer(merLength, compressedByCase))
            {
                Regex merPattern = AASeq.CreateMerRegex(mer); //!!!look for similar code elsewhere

                foreach (string protein in EveryProtein())
                {
                    Set <bool> distinctValues = Set <bool> .GetInstance();
                    Dictionary <string, bool> valueByCase = new Dictionary <string, bool>();

                    foreach (KeyValuePair <string, AASeq> caseIdAndSeq in compressedByCase)
                    {
                        AASeq compressed = caseIdAndSeq.Value;
                        Helper.CheckCondition(compressed.MixtureSemantics == MixtureSemantics.Uncertainty, "Code does not expect Mixture semantics");

                        bool?containsOrNull = compressed.ContainsMer(mer, merPattern, protein);
                        if (!containsOrNull.HasValue)
                        {
                            // A null answer means the case contributes no value to this variable.
                            continue;
                        }

                        bool contains = containsOrNull.Value;
                        valueByCase.Add(caseIdAndSeq.Key, contains);
                        distinctValues.AddNewOrOld(contains);
                    }

                    if (!keepOneValueVariables && distinctValues.Count != 2)
                    {
                        continue;
                    }

                    string variableName = protein + "@" + mer;
                    foreach (KeyValuePair <string, bool> caseIdAndVal in valueByCase)
                    {
                        yield return Helper.CreateTabString(variableName, caseIdAndVal.Key, caseIdAndVal.Value ? 1 : 0);
                    }
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Two sequences are equal when their <c>ToString()</c> representations are equal.
        /// </summary>
        /// <param name="obj">Object to compare with; anything that is not an <c>AASeq</c> (including null) is unequal.</param>
        /// <returns>True when <paramref name="obj"/> is an <c>AASeq</c> with the same string form.</returns>
        public override bool Equals(object obj)
        {
            AASeq other = obj as AASeq;

            return (other == null) ? false : (ToString() == other.ToString());
        }
Пример #9
0
        //	/*
        //	1189MB	MEPVDPNLEPWNHPGSQPKTPCTNCYCKHCSYHCLVCFQTKGLGISYGRK
        //	J112MA	MEPVDPNLEPWNHPGSQPITACNKCYCKYCSYHCLVCFQTKGLGISYGRK
        //	1157M3M   MEPVDPNLEPWNHPGSQPKTPCNKCYCKHCSYHCLVCFQTKGLGISYGRK
        //	1195MB	MEPVDPNLEPWNHPGSQPKTPCNKCYCKYCSYHCLVCFQTKGLGISYGRK
        //	 */
        /// <summary>
        /// Reads a tab-separated table with columns "cid" and "aaSeq" and builds a collection
        /// holding one parsed sequence per row.
        /// </summary>
        /// <param name="textReader">Source of the table text.</param>
        /// <param name="mixtureSemantics">Forwarded to <c>AASeq.GetInstance</c> for every row.</param>
        /// <param name="offset">Forwarded to <c>AASeq.GetInstance</c> for every row.</param>
        /// <returns>The populated collection.</returns>
        static public CaseIdToAASeq GetInstance(TextReader textReader, MixtureSemantics mixtureSemantics, int offset)
        {
            CaseIdToAASeq result = CaseIdToAASeq.GetInstance();

            foreach (Dictionary <string, string> row in SpecialFunctions.TabFileTable(textReader, "cid\taaSeq", false))
            {
                //!!!const - the column names are repeated as literals here
                result.Add(row["cid"], AASeq.GetInstance(row["aaSeq"], mixtureSemantics, offset));
            }

            return result;
        }
Пример #10
0
        //static public CaseIdToAASeq GetInstance(TextReader textReader, bool mixture)
        //{
        //	return GetInstance(textReader, mixture, 0);
        //}


        /// <summary>
        /// Registers a sequence under a case id that has not been used before.
        /// </summary>
        /// <remarks>
        /// The first sequence added fixes <c>SequenceLengthOrNull</c>. A later sequence of a different
        /// length only triggers a warning on standard error; it is still stored and the recorded
        /// length is left unchanged.
        /// </remarks>
        /// <param name="caseId">Identifier; the condition check fails if it is already present.</param>
        /// <param name="aaSeq">Sequence to store.</param>
        public void Add(string caseId, AASeq aaSeq)
        {
            bool isNewCaseId = !_caseIdToAASeq.ContainsKey(caseId);
            Helper.CheckCondition(isNewCaseId, string.Format("caseId {0} appears more than once", caseId));

            //!!!1205x
            int incomingLength = aaSeq.Count;
            if (null == SequenceLengthOrNull)
            {
                SequenceLengthOrNull = incomingLength;
            }
            if (SequenceLengthOrNull != incomingLength)
            {
                Console.Error.WriteLine("Warning: Not all amino acid sequences are of the same length");
            }

            _caseIdToAASeq.Add(caseId, aaSeq);
        }
Пример #11
0
        /// <summary>
        /// Splits every stored sequence into per-protein pieces and writes one "cid\taaSeq" table per
        /// protein to <paramref name="outputDirectory"/>.
        /// </summary>
        /// <remarks>
        /// The protein of a position is the text before the first '@' in its original position name.
        /// Output files are named "{protein}.{niceName}.aaSeq.txt".
        /// </remarks>
        /// <param name="niceName">Label written to the debug output and embedded in each file name.</param>
        /// <param name="outputDirectory">Directory part of each output path (joined with a backslash).</param>
        public void SplitOnProtein(string niceName, string outputDirectory)
        {
            Debug.WriteLine(niceName);
            Dictionary <string, Dictionary <string, string> > sequenceByCaseByProtein = new Dictionary <string, Dictionary <string, string> >();

            foreach (KeyValuePair <string, AASeq> caseIdAndSeq in _caseIdToAASeq)
            {
                string caseId = caseIdAndSeq.Key;
                AASeq  aaSeq  = caseIdAndSeq.Value;

                string lastProtein = null;
                Dictionary <string, StringBuilder> builderByProtein = new Dictionary <string, StringBuilder>();
                for (int aa0Pos = 0; aa0Pos < (int)SequenceLengthOrNull; ++aa0Pos)
                {
                    string protein = aaSeq.OriginalAA1Position(aa0Pos).Split('@')[0];

                    // NOTE(review): presumably the project's GetValueOrDefault extension creates and stores
                    // an empty StringBuilder for an unseen protein - confirm.
                    StringBuilder builder = builderByProtein.GetValueOrDefault(protein);
                    if (lastProtein != protein)
                    {
                        // On a protein change its builder must still be empty, i.e. the protein has not been met earlier.
                        Helper.CheckCondition(builder.Length == 0, "Expect proteins to be contintiguous");
                        lastProtein = protein;
                    }

                    builder.Append(AASeq.AaAsString(aaSeq[aa0Pos]));
                }

                foreach (KeyValuePair <string, StringBuilder> proteinAndBuilder in builderByProtein)
                {
                    sequenceByCaseByProtein.GetValueOrDefault(proteinAndBuilder.Key).Add(caseId, proteinAndBuilder.Value.ToString());
                }
            }

            foreach (KeyValuePair <string, Dictionary <string, string> > proteinAndCases in sequenceByCaseByProtein)
            {
                string outputFileName = string.Format(@"{0}\{1}.{2}.aaSeq.txt", outputDirectory, proteinAndCases.Key, niceName);
                using (TextWriter textWriter = File.CreateText(outputFileName))
                {
                    textWriter.WriteLine("cid\taaSeq"); //!!!const
                    foreach (KeyValuePair <string, string> caseIdAndText in proteinAndCases.Value)
                    {
                        textWriter.WriteLine(Helper.CreateTabString(caseIdAndText.Key, caseIdAndText.Value));
                    }
                }
            }
        }
Пример #12
0
        /// <summary>
        /// Computes the consensus of the given sequences and returns it as a sequence named "consensus".
        /// </summary>
        /// <remarks>
        /// The first element decides, via <c>IsDna()</c>, whether ALL elements are parsed with
        /// <c>DnaSeq.GetInstance</c> or <c>AASeq.GetInstance</c>. Every sequence is parsed with
        /// <c>MixtureSemantics.Uncertainty</c>, and the consensus itself is delegated to
        /// <c>GetAaSeqConsensus</c>.
        /// </remarks>
        /// <param name="seqs">Sequences to combine; the first element is read unconditionally, so the list must not be empty.</param>
        /// <returns>A <c>NamedSequence</c> named "consensus" holding the consensus text.</returns>
        public static NamedSequence GetConsensus(List <NamedSequence> seqs)
        {
            List <AASeq> aaSeqs = new List <AASeq>();
            bool         isDna  = seqs[0].IsDna();

            foreach (NamedSequence seq in seqs)
            {
                AASeq aaSeq = isDna ? DnaSeq.GetInstance(seq.Sequence, MixtureSemantics.Uncertainty) : AASeq.GetInstance(seq.Sequence, MixtureSemantics.Uncertainty);
                aaSeqs.Add(aaSeq);
            }

            // No maximum length is tracked here: GetAaSeqConsensus works it out from the list it is given.
            AASeq consensusAaSeq = GetAaSeqConsensus(aaSeqs);

            return new NamedSequence("consensus", consensusAaSeq.ToString());
        }
Пример #13
0
        /// <summary>
        /// Copies <paramref name="merLength"/> consecutive items, starting at the zero-based index
        /// <paramref name="aa0Pos"/>, into a new sequence that carries their original position names.
        /// </summary>
        /// <param name="aa0Pos">Zero-based index of the first item to copy.</param>
        /// <param name="merLength">Number of source items to visit.</param>
        /// <param name="skipDelete">
        /// When true, items equal to <c>AASeq.Delete</c> are left out, so the result can be shorter
        /// than <paramref name="merLength"/>.
        /// </param>
        /// <returns>A new sequence using this instance's mixture semantics.</returns>
        public AASeq SubSeqAA0Pos(int aa0Pos, int merLength, bool skipDelete)
        {
            List <Set <char> > keptItems = new List <Set <char> >();
            AASeq result = new AASeq(MixtureSemantics);

            result.Sequence = keptItems;
            result._originalAA1PositionTableOrNull = new List <string>();
            Debug.Assert(result.Offset == 0); // real assert

            int endExclusive = aa0Pos + merLength;
            for (int index = aa0Pos; index < endExclusive; ++index)
            {
                Set <char> item = Sequence[index];
                if (skipDelete && AASeq.Delete.Equals(item))
                {
                    continue;
                }

                result._originalAA1PositionTableOrNull.Add(OriginalAA1Position(index));
                keptItems.Add(item);
            }

            return result;
        }
Пример #14
0
        /// <summary>
        /// Walks the positions of the first stored sequence and records every not-yet-seen
        /// (protein, third '@' field) combination under its protein.
        /// </summary>
        /// <remarks>
        /// Each original position name is split on '@'; field 0 is used as the protein and field 2 as
        /// the second element of the tab-joined "triple" string, so each name needs at least three fields.
        /// </remarks>
        /// <param name="seenTriple">Set of triples already recorded; new ones are added to it.</param>
        /// <param name="proteinToTripleList">Per-protein lists that receive the new triples.</param>
        public void TriplesAppend(ref Set <string> seenTriple, ref Dictionary <string, List <string> > proteinToTripleList)
        {
            AASeq firstSeq = GetFirstAASeq();

            for (int aa0Pos = 0; aa0Pos < (int)SequenceLengthOrNull; ++aa0Pos)
            {
                string[] nameParts = firstSeq.OriginalAA1Position(aa0Pos).Split('@');
                string   protein   = nameParts[0];
                string   triple    = Helper.CreateTabString(protein, nameParts[2]);

                if (seenTriple.Contains(triple))
                {
                    continue;
                }

                seenTriple.AddNew(triple);
                proteinToTripleList.GetValueOrDefault(protein).Add(triple);
            }
        }
Пример #15
0
        /// <summary>
        /// Lazily yields one sequence per combination produced by
        /// <c>SpecialFunctions.EveryCombination</c> over this sequence; every position of a yielded
        /// sequence holds a single-character set.
        /// </summary>
        /// <remarks>
        /// Each yielded sequence keeps this instance's mixture semantics and original position names.
        /// The condition check fails if a single-character set equals <c>AASeq.Delete</c>.
        /// </remarks>
        public IEnumerable <AASeq> BlowOut()
        {
            foreach (List <char> combination in SpecialFunctions.EveryCombination <char>(this.Sequence))
            {
                List <Set <char> > singletons = new List <Set <char> >();
                AASeq expanded = new AASeq(MixtureSemantics);
                expanded.Sequence = singletons;
                expanded._originalAA1PositionTableOrNull = new List <string>();
                Debug.Assert(expanded.Offset == 0); // real assert

                for (int index = 0; index < Count; ++index)
                {
                    Set <char> singleton = Set <char> .GetInstance(combination[index]);
                    Helper.CheckCondition(!AASeq.Delete.Equals(singleton), "'delete' not expected");

                    expanded._originalAA1PositionTableOrNull.Add(OriginalAA1Position(index));
                    singletons.Add(singleton);
                }

                yield return expanded;
            }
        }
Пример #16
0
        /// <summary>
        /// Determines, for every case, whether its compressed sequence contains the given mer.
        /// </summary>
        /// <param name="merAsString">The mer to look for; also compiled into a regex with <c>AASeq.CreateMerRegex</c>.</param>
        /// <param name="caseToCompressedAASeq">Case id to the sequence to search.</param>
        /// <param name="valueToNonZeroCount">Receives how many cases produced true and how many produced false.</param>
        /// <returns>Case id to containment result; cases whose result is null are omitted.</returns>
        private Dictionary <string, bool> FindMerValues(string merAsString, Dictionary <string, AASeq> caseToCompressedAASeq, out Dictionary <bool, int> valueToNonZeroCount)
        {
            Regex merPattern = AASeq.CreateMerRegex(merAsString);
            Dictionary <string, bool> valueByCase = new Dictionary <string, bool>();

            valueToNonZeroCount = new Dictionary <bool, int>();
            foreach (KeyValuePair <string, AASeq> entry in caseToCompressedAASeq)
            {
                bool?containsOrNull = entry.Value.ContainsMer(merAsString, merPattern);
                if (!containsOrNull.HasValue)
                {
                    // A null answer contributes neither a value nor a count.
                    continue;
                }

                bool contains = containsOrNull.Value;
                valueByCase.Add(entry.Key, contains);
                valueToNonZeroCount[contains] = 1 + valueToNonZeroCount.GetValueOrDefault(contains);
            }

            return valueByCase;
        }
Пример #17
0
        /// <summary>
        /// Builds a consensus sequence by taking, at every position up to the longest input, the key
        /// of the first entry returned by <c>AASeq.CountAasAtPos</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this presumes <c>CountAasAtPos</c> returns its entries most-frequent first - confirm.
        /// </remarks>
        /// <param name="aaSeqs">Sequences to combine; they may differ in length.</param>
        /// <returns>The consensus, parsed with <c>MixtureSemantics.Uncertainty</c>.</returns>
        public static AASeq GetAaSeqConsensus(List <AASeq> aaSeqs)
        {
            // Length of the longest input; stays at -1 for an empty list, so no position is visited.
            int longest = -1;
            foreach (AASeq candidate in aaSeqs)
            {
                if (candidate.Count > longest)
                {
                    longest = candidate.Count;
                }
            }

            StringBuilder consensusText = new StringBuilder();
            for (int pos0 = 0; pos0 < longest; pos0++)
            {
                consensusText.Append(AASeq.CountAasAtPos(aaSeqs, pos0)[0].Key);
            }

            return AASeq.GetInstance(consensusText.ToString(), MixtureSemantics.Uncertainty);
        }
Пример #18
0
        /// <summary>
        /// Copies a sequence while dropping every position equal to <c>Delete</c> and handling
        /// positions equal to <c>Stop</c>.
        /// </summary>
        /// <remarks>
        /// A stop at the last position ends the copy and is not included. A stop anywhere else writes
        /// a warning to <paramref name="errorStream"/>; it then ends the copy when
        /// <paramref name="stopOnStop"/> is true and is otherwise copied like any other position.
        /// </remarks>
        /// <param name="caseId">Identifier used only in the warning text.</param>
        /// <param name="aaSeqIn">Sequence to copy from.</param>
        /// <param name="stopOnStop">Whether an early stop ends the copy.</param>
        /// <param name="errorStream">Receives the early-stop warnings.</param>
        /// <returns>A new sequence with the same mixture semantics and the surviving positions' original names.</returns>
        static public AASeq GetCompressedInstance(string caseId, AASeq aaSeqIn, bool stopOnStop, TextWriter errorStream)
        {
            AASeq compressed = new AASeq(aaSeqIn.MixtureSemantics);

            compressed.Sequence = new List <Set <char> >();
            compressed._originalAA1PositionTableOrNull = new List <string>();
            Debug.Assert(compressed.Offset == 0); // real assert

            for (int iChar = 0; iChar < aaSeqIn.Count; ++iChar)
            {
                Set <char> current      = aaSeqIn[iChar];
                string     positionName = aaSeqIn.OriginalAA1Position(iChar);

                if (current.Equals(Delete)) //!!!const
                {
                    continue;
                }

                if (current.Equals(Stop)) //!!!const
                {
                    bool isLastPosition = (iChar == aaSeqIn.Count - 1);
                    if (!isLastPosition)
                    {
                        errorStream.WriteLine("Warning: The sequence for case id '{0}' contains a '*' before the last position", caseId);
                    }
                    if (isLastPosition || stopOnStop)
                    {
                        break;
                    }
                }

                compressed.Sequence.Add(current);
                compressed._originalAA1PositionTableOrNull.Add(positionName);
            }

            return compressed;
        }
Пример #19
0
        /// <summary>
        /// Extracts a window around a one-based position. Deletes are skipped, and the window is
        /// allowed to be short when it lies near an end of the larger sequence.
        /// </summary>
        /// <param name="aa1Pos">One-based position handed to <c>FindStartPositionBase1OrNull</c>.</param>
        /// <param name="merLength">Requested window length; must be greater than 0. Each flank aims for (merLength - 1) / 2 items.</param>
        /// <param name="centerAA">
        /// Receives the item at the resolved centre, or <c>AASeq.Delete</c> when no start position is found.
        /// </param>
        /// <returns>
        /// The window with deletes removed, or the result of <c>SubSeqAA0Pos(0, 0, false)</c> when no
        /// start position is found.
        /// </returns>
        public AASeq SubSeqCenteredAA1Pos(int aa1Pos, int merLength, out Set <char> centerAA)
        {
            Helper.CheckCondition(merLength > 0, "merLength must be greater than 0");

            int?startBase1OrNull = FindStartPositionBase1OrNull(aa1Pos);
            if (!startBase1OrNull.HasValue)
            {
                centerAA = AASeq.Delete;
                return SubSeqAA0Pos(0, 0, false);
            }

            int centerBase0 = startBase1OrNull.Value - 1;
            centerAA = Sequence[centerBase0];

            // Integer division: the number of items sought on each side of the centre.
            int flankGoal = (merLength - 1) / 2;

            int leftIndex  = FindLeftPosBase0(centerBase0, flankGoal);
            int rightIndex = FindRightBase0(centerBase0, flankGoal);

            return SubSeqAA0Pos(leftIndex, rightIndex - leftIndex + 1, true);
        }
Пример #20
0
        /// <summary>
        /// Writes a tab-separated table with one column per sequence and one row per
        /// (one-based position, amino acid) variable named "pos@aa".
        /// </summary>
        /// <remarks>
        /// Cell values: missing when the observed set contains '?', is empty, or is a mixture that
        /// includes the amino acid under <c>MixtureSemantics.Uncertainty</c>; 1 when the amino acid is
        /// observed and either the semantics are not <c>Pure</c> or it is the only one observed;
        /// otherwise 0. A row is written when it has more than one distinct non-missing value, or when
        /// <c>KeepOneValueVariables</c> is set and its single non-missing value is 1.
        /// The first sequence decides, via <c>IsDna()</c>, how all sequences are parsed.
        /// </remarks>
        /// <param name="sequences">Sequences to tabulate; the first element is read unconditionally.</param>
        /// <param name="writer">Destination; flushed before returning.</param>
        public void WriteAsTable(List <NamedSequence> sequences, TextWriter writer)
        {
            CaseIdToAASeq seqByName  = CaseIdToAASeq.GetInstance();
            bool          parseAsDna = sequences[0].IsDna();

            foreach (NamedSequence named in sequences)
            {
                seqByName.Add(named.Name,
                              parseAsDna ?
                              DnaSeq.GetInstance(named.Sequence, MixtureSemantics) :
                              AASeq.GetInstance(named.Sequence, MixtureSemantics));
            }

            // Header row: "Var" followed by every sequence name.
            List <string> headerCells = new List <string>(sequences.Count + 1);
            headerCells.Add("Var");
            headerCells.AddRange(sequences.Select(seq => seq.Name));
            writer.WriteLine(headerCells.StringJoin("\t"));

            int longest = seqByName.Dictionary.Values.Select(aaSeq => aaSeq.Count).Max();

            for (int pos0 = 0; pos0 < longest; pos0++)
            {
                foreach (char aa in seqByName.EveryAminoAcid(pos0))
                {
                    string        rowName        = (pos0 + 1) + "@" + aa;
                    int?[]        rowValues      = new int?[sequences.Count];
                    HashSet <int> distinctValues = new HashSet <int>();

                    for (int seqIdx = 0; seqIdx < sequences.Count; seqIdx++)
                    {
                        Set <char> observed = seqByName.Dictionary[sequences[seqIdx].Name][pos0];

                        int?cell;
                        if (observed.Contains('?') || observed.Count == 0 ||
                            (observed.Count > 1 && MixtureSemantics == MixtureSemantics.Uncertainty && observed.Contains(aa)))
                        {
                            cell = null;
                        }
                        else if (observed.Contains(aa) && (MixtureSemantics != MixtureSemantics.Pure || observed.Count == 1))
                        {
                            cell = 1;
                        }
                        else
                        {
                            cell = 0;
                        }

                        rowValues[seqIdx] = cell;
                        if (cell.HasValue)
                        {
                            distinctValues.Add(cell.Value);
                        }
                    }

                    bool hasVariation = distinctValues.Count > 1;
                    if (hasVariation || (KeepOneValueVariables && distinctValues.Count == 1 && distinctValues.First() == 1))
                    {
                        writer.WriteLine(Helper.CreateTabString(rowName, rowValues.Select(v => v.HasValue ? v.ToString() : MissingStatistics.GetInstance().ToString()).StringJoin("\t")));
                    }
                }
            }

            writer.Flush();
        }
Пример #21
0
 /// <summary>
 /// One-based variant of <c>TrySubSeqAA0Pos</c>: converts <paramref name="aa1Pos"/> to a
 /// zero-based index and delegates.
 /// </summary>
 /// <param name="aa1Pos">One-based start position.</param>
 /// <param name="merLength">Number of items requested.</param>
 /// <param name="aaSeq">Receives whatever <c>TrySubSeqAA0Pos</c> assigns.</param>
 /// <returns>The result of <c>TrySubSeqAA0Pos</c>.</returns>
 public bool TrySubSeqAA1Pos(int aa1Pos, int merLength, out AASeq aaSeq)
 {
     int aa0Pos = aa1Pos - 1;

     return TrySubSeqAA0Pos(aa0Pos, merLength, out aaSeq);
 }
Пример #22
0
        /// <summary>
        /// Converts a list of named sequences into a matrix whose rows are "pos@aa" variables and
        /// whose columns are sequence names.
        /// </summary>
        /// <remarks>
        /// Rows are created only for (position, residue) pairs observed as a single, non-missing
        /// residue in at least one sequence. Cells start as missing and are then filled per sequence:
        /// missing if the sequence is missing at that position, false if the sequence does not have
        /// the row's residue, true if it has exactly that residue, and otherwise (a mixture that
        /// includes the residue) a value chosen by <paramref name="mix"/>.
        /// </remarks>
        /// <param name="sequences">Sequences to convert; each exposes position/residue-set pairs through its AASeq member.</param>
        /// <param name="mix">How a mixture that contains the row's residue is encoded.</param>
        /// <param name="dataType">Whether to return the binary matrix itself or a multistate view over it.</param>
        /// <param name="keepOneValueVariables">When false, rows whose cells hold only one distinct value are filtered out.</param>
        /// <returns>The binary matrix, or a multistate view wrapping it.</returns>
        public static Matrix <string, string, SufficientStatistics> ConvertToMatrix(List <NamedSequence> sequences, MixtureSemantics mix, BinaryOrMultistate dataType, bool keepOneValueVariables)
        {
            var colNames = sequences.Select(s => s.Name);
            //var rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
            //                where !AASeq.IsMissing(posAndAa.Value)
            //                let pos = posAndAa.Key
            //                let aas = posAndAa.Value
            //                from c in aas
            //                let merAndPos = pos + "@" + c
            //                orderby pos, c
            //                select merAndPos).Distinct();

            // Distinct "pos@aa" names, ordered by position then residue, taken only from entries
            // that hold exactly one residue and are not missing. This is a deferred query.
            var rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
                            where posAndAa.Value.Count == 1 && !AASeq.IsMissing(posAndAa.Value)
                            let pos = posAndAa.Key
                                      let aas = posAndAa.Value
                                                let c = aas.First()
                                                        let merAndPos = pos + "@" + c
                                                                        orderby pos, c
                            select merAndPos).Distinct();

            // Groups the row names by the position that Tabulate.GetMerAndPos extracts from each name.
            var posToRowNames = (from row in rowNames
                                 let pos = (int)Tabulate.GetMerAndPos(row).Value
                                           group row by pos into g
                                           select new KeyValuePair <int, List <string> >(g.Key, g.ToList())).ToDictionary();

            // Every cell starts out as missing.
            Matrix <string, string, SufficientStatistics> m = DenseMatrix <string, string, SufficientStatistics> .CreateDefaultInstance(rowNames, colNames, MissingStatistics.GetInstance());

            foreach (var seq in sequences)
            {
                foreach (var posAndAa in seq.AASeq)
                {
                    int pos = posAndAa.Key;
                    if (!posToRowNames.ContainsKey(pos))
                    {
                        // No row exists for this position, so this sequence is expected to be missing here too.
                        Helper.CheckCondition(AASeq.IsMissing(posAndAa.Value), "Something's wrong. We thinking everyone is missing at position {0}, but {1} has {2}", pos, seq.Name, posAndAa.Value);
                        continue;
                    }

                    var relevantRows = posToRowNames[pos];

                    bool isMissing = AASeq.IsMissing(posAndAa.Value);
                    // The "pos@aa" names of every residue this sequence holds at this position.
                    var  myRows    = posAndAa.Value.Select(c => pos + "@" + c).ToList();
                    foreach (var row in relevantRows)
                    {
                        SufficientStatistics value;
                        if (isMissing)
                        {
                            value = MissingStatistics.GetInstance();
                        }
                        else if (!myRows.Contains(row))
                        {
                            value = (BooleanStatistics)false;   //in all cases, this is false
                        }
                        else if (myRows.Count == 1)
                        {
                            value = (BooleanStatistics)true;
                        }
                        else
                        {
                            // A mixture that includes this row's residue: the encoding depends on the semantics.
                            switch (mix)
                            {
                            case MixtureSemantics.Any:
                                value = (BooleanStatistics)true; break;       //Any means we say you have both

                            case MixtureSemantics.Pure:
                                value = (BooleanStatistics)false; break;       //Pure means we say you have neither

                            case MixtureSemantics.Uncertainty:
                                value = MissingStatistics.GetInstance(); break;       //Uncertainty says we don't know which you have.

                            case MixtureSemantics.Distribution:
                                // Each residue of the mixture is given probability 1 / (number of residues).
                                double pTrue = 1.0 / myRows.Count;
                                value = MultinomialStatistics.GetInstance(new double[] { 1 - pTrue, pTrue });
                                break;

                            default:
                                throw new NotImplementedException("Missing a case.");
                            }
                        }
                        m.SetValueOrMissing(row, seq.Name, value);
                    }
                }
            }

            if (!keepOneValueVariables)
            {
                // Keep only rows whose values show more than one distinct value.
                m = m.SelectRowsView(m.RowKeys.Where(row => m.RowView(row).Values.Distinct().Count() > 1));
            }

            switch (dataType)
            {
            case BinaryOrMultistate.Binary:
                return(m);

            case BinaryOrMultistate.Multistate:
                return(new BinaryToMultistateView <string, string, SufficientStatistics>(m, Tabulate.BinaryToMultistateMapping(m), ValueConverter.SufficientStatisticsToMultinomial));

            default:
                throw new NotImplementedException("Missing a case");
            }
        }
Пример #23
0
        /// <summary>
        /// Returns the first sequence in the enumeration order of the backing dictionary's values.
        /// </summary>
        /// <returns>The first stored sequence; <c>First()</c> throws when nothing is stored.</returns>
        private AASeq GetFirstAASeq()
        {
            return this._caseIdToAASeq.Values.First();
        }
Пример #24
0
        /// <summary>
        /// Returns the proteins reported by the first stored sequence's <c>EveryProtein()</c>.
        /// </summary>
        private IEnumerable <string> EveryProtein()
        {
            return GetFirstAASeq().EveryProtein();
        }
Пример #25
0
        /// <summary>
        /// Lazily produces tab-separated lines "posName@aa, caseId, 0|1", one per position, amino acid
        /// and case that has a non-missing value.
        /// </summary>
        /// <remarks>
        /// For each amino acid observed at a position, a case's value is:
        /// missing when its set equals <c>AASeq.Any</c> (e.g. A/Any), or when it is a mixture containing
        /// the amino acid under <c>Uncertainty</c> semantics (e.g. A/AB);
        /// 1 when its set is exactly that amino acid (A/A), or a mixture containing it under <c>Any</c> semantics;
        /// 0 when its set does not contain the amino acid (A/B, A/BCD), or is a mixture containing it
        /// under <c>Pure</c> semantics.
        /// Cases whose sequence is too short for the position are skipped. All cases must report the
        /// same original position name for a given index.
        /// </remarks>
        /// <param name="keepOneValueVariables">
        /// When false, positions with a single observed amino acid are skipped and a variable is
        /// emitted only if both 0 and 1 occur for it.
        /// </param>
        public IEnumerable <string> SparseLineEnumeration(bool keepOneValueVariables)
        {
            if (_caseIdToAASeq.Count == 0)
            {
                Debug.Assert(SequenceLengthOrNull == null); // real assert
                yield break;
            }
            Helper.CheckCondition(SequenceLengthOrNull != null, "This converter to sparse assumes all sequences have the same length");

            /*
             * Example of the output columns:
             * n1pos	aa	pid	val
             * 880	A	3	F
             * 880	A	15	T
             * ...
             */

            for (int aa0Pos = 0; aa0Pos < (int)SequenceLengthOrNull; ++aa0Pos)
            {
                Set <char> aminoAcidsHere = EveryAminoAcid(aa0Pos);
                if (!keepOneValueVariables && aminoAcidsHere.Count == 1)
                {
                    continue;
                }

                // Set from the first case seen at this position, then cross-checked against all the others.
                string posName = null;
                foreach (char aa in aminoAcidsHere)
                {
                    Set <bool> distinctValues = Set <bool> .GetInstance();
                    Dictionary <string, bool> valueByCase = new Dictionary <string, bool>();

                    foreach (KeyValuePair <string, AASeq> caseIdAndSeq in _caseIdToAASeq)
                    {
                        AASeq aaSeq = caseIdAndSeq.Value;
                        if (aa0Pos >= aaSeq.Count)
                        {
                            continue;
                        }

                        Set <char> strainAASet = aaSeq[aa0Pos];
                        if (posName == null)
                        {
                            posName = aaSeq.OriginalAA1Position(aa0Pos);
                        }
                        else
                        {
                            Helper.CheckCondition(posName == aaSeq.OriginalAA1Position(aa0Pos));
                        }

                        // null stands for "missing": the case then contributes nothing for this variable.
                        bool?valueOrNull;
                        if (strainAASet.Equals(AASeq.Any))
                        {
                            valueOrNull = null;
                        }
                        else if (!strainAASet.Contains(aa))
                        {
                            valueOrNull = false;
                        }
                        else if (strainAASet.Count <= 1)
                        {
                            valueOrNull = true;
                        }
                        else
                        {
                            // A mixture that includes this amino acid: the semantics decide.
                            switch (aaSeq.MixtureSemantics)
                            {
                            case MixtureSemantics.Pure:
                                valueOrNull = false;
                                break;

                            case MixtureSemantics.Uncertainty:
                                valueOrNull = null;
                                break;

                            case MixtureSemantics.Any:
                                valueOrNull = true;
                                break;

                            default:
                                Helper.CheckCondition(false, "Unknown mixturesemantics " + aaSeq.MixtureSemantics.ToString());
                                valueOrNull = null;
                                break;
                            }
                        }

                        if (valueOrNull.HasValue)
                        {
                            valueByCase.Add(caseIdAndSeq.Key, valueOrNull.Value);
                            distinctValues.AddNewOrOld(valueOrNull.Value);
                        }
                    }

                    Helper.CheckCondition(posName != null);
                    if (!keepOneValueVariables && distinctValues.Count != 2)
                    {
                        continue;
                    }

                    string variableName = string.Format("{0}@{1}", posName, aa);
                    foreach (KeyValuePair <string, bool> caseIdAndVal in valueByCase)
                    {
                        yield return Helper.CreateTabString(variableName, caseIdAndVal.Key, caseIdAndVal.Value ? 1 : 0);
                    }
                }
            }
        }