Beispiel #1
0
        /// <summary>
        /// Rebuilds one <see cref="NamedSequence"/> per matrix column from a matrix whose row keys
        /// encode a mer and a position (as parsed by <c>Tabulate.GetMerAndPos</c>).
        /// </summary>
        /// <param name="matrix">
        /// Matrix to convert. If no row key contains '#', it is first wrapped in a
        /// <c>BinaryToMultistateView</c> so that every row is handled as a multistate variable.
        /// </param>
        /// <returns>
        /// One sequence per column, in column order. Gaps between positions and cells whose value
        /// is missing are filled with <c>MissingChar</c>.
        /// </returns>
        public List<NamedSequence> ConvertToSequences(Matrix<string, string, SufficientStatistics> matrix)
        {
            // A '#' in any row key marks the matrix as already multistate; otherwise view it as multistate.
            if (!matrix.RowKeys.Any(key => key.Contains('#')))
            {
                matrix = new BinaryToMultistateView<string, string, SufficientStatistics>(
                    matrix,
                    Tabulate.BinaryToMultistateMapping(matrix),
                    MBT.Escience.ValueConverter.SufficientStatisticsToInt);
            }

            // Parse each row key once and sort by position so the sequence can be built left to right.
            // Position0 is the parsed position minus one (presumably keys are 1-based -- confirm).
            var orderedRows = matrix.RowKeys
                .Select(key => new { Key = key, MerAndPos = Tabulate.GetMerAndPos(key) })
                .OrderBy(parsed => parsed.MerAndPos.Value)
                .Select(parsed => new
                {
                    Position0 = (int)parsed.MerAndPos.Value - 1,
                    Chars = parsed.MerAndPos.Key.Split('#'),
                    Key = parsed.Key
                })
                .ToList();

            List<NamedSequence> sequences = new List<NamedSequence>(matrix.ColCount);

            for (int colIndex = 0; colIndex < matrix.ColCount; colIndex++)
            {
                string colKey = matrix.ColKeys[colIndex];
                StringBuilder builder = new StringBuilder(matrix.RowCount);

                foreach (var row in orderedRows)
                {
                    // The builder must not already extend past this row's slot.
                    Helper.CheckCondition(builder.Length <= row.Position0, "There appears to be multiple keys with the same position.");

                    // Pad any positions that have no row in the matrix.
                    while (row.Position0 > builder.Length)
                    {
                        builder.Append(MissingChar);
                    }

                    DiscreteStatistics stats = matrix.GetValueOrMissing(row.Key, colKey).AsDiscreteStatistics();
                    if (stats.IsMissing())
                    {
                        builder.Append(MissingChar);
                        continue;
                    }

                    string state;
                    if (row.Chars.Length != 1)
                    {
                        // Multistate row: the statistic is used as an index into the '#'-separated states.
                        state = row.Chars[stats];
                    }
                    else
                    {
                        // Single-state row: the only acceptable non-missing value is 1.
                        state = row.Chars[0];
                        Helper.CheckCondition((int)stats == 1);
                    }

                    Helper.CheckCondition(state.Length == 1, "State {0} is too long. Must be a single character.", state);
                    builder.Append(state);
                }

                sequences.Add(new NamedSequence(colKey, builder.ToString()));
            }

            return sequences;
        }
Beispiel #2
0
        /// <summary>
        /// Builds a matrix with one row per observed "position@state" pair and one column per sequence.
        /// </summary>
        /// <param name="sequences">Sequences to tabulate; each sequence name becomes a column key.</param>
        /// <param name="mix">
        /// How a cell is scored when a sequence has more than one state at a position and the row's
        /// state is among them: Any gives true, Pure gives false, Uncertainty gives missing, and
        /// Distribution gives a two-outcome multinomial with P(true) = 1 / (number of states).
        /// </param>
        /// <param name="dataType">
        /// Binary returns the matrix as built; Multistate wraps it in a <c>BinaryToMultistateView</c>.
        /// </param>
        /// <param name="keepOneValueVariables">
        /// When false, rows whose cells do not contain more than one distinct value are dropped.
        /// </param>
        /// <returns>The binary matrix, or a multistate view over it.</returns>
        /// <exception cref="NotImplementedException">
        /// Thrown when <paramref name="mix"/> or <paramref name="dataType"/> has a value with no matching case.
        /// </exception>
        public static Matrix<string, string, SufficientStatistics> ConvertToMatrix(List<NamedSequence> sequences, MixtureSemantics mix, BinaryOrMultistate dataType, bool keepOneValueVariables)
        {
            var colNames = sequences.Select(s => s.Name);

            // Rows are defined only by unambiguous observations (exactly one non-missing state at a position).
            // Materialized once: the list is consumed both by the grouping below and by matrix creation,
            // and the underlying query flattens, sorts and de-duplicates every sequence.
            var rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
                            where posAndAa.Value.Count == 1 && !AASeq.IsMissing(posAndAa.Value)
                            let pos = posAndAa.Key
                            let aas = posAndAa.Value
                            let c = aas.First()
                            let merAndPos = pos + "@" + c
                            orderby pos, c
                            select merAndPos).Distinct().ToList();

            // Position -> all row names defined at that position.
            var posToRowNames = (from row in rowNames
                                 let pos = (int)Tabulate.GetMerAndPos(row).Value
                                 group row by pos into g
                                 select new KeyValuePair<int, List<string>>(g.Key, g.ToList())).ToDictionary();

            Matrix<string, string, SufficientStatistics> m = DenseMatrix<string, string, SufficientStatistics>.CreateDefaultInstance(rowNames, colNames, MissingStatistics.GetInstance());

            foreach (var seq in sequences)
            {
                foreach (var posAndAa in seq.AASeq)
                {
                    int pos = posAndAa.Key;

                    // Single lookup instead of ContainsKey followed by the indexer.
                    List<string> relevantRows;
                    if (!posToRowNames.TryGetValue(pos, out relevantRows))
                    {
                        // No row exists for this position, so the check requires this value to be missing.
                        Helper.CheckCondition(AASeq.IsMissing(posAndAa.Value), "Something's wrong. We thinking everyone is missing at position {0}, but {1} has {2}", pos, seq.Name, posAndAa.Value);
                        continue;
                    }

                    bool isMissing = AASeq.IsMissing(posAndAa.Value);

                    // Row names this sequence matches at this position (more than one means a mixture).
                    var myRows = posAndAa.Value.Select(c => pos + "@" + c).ToList();

                    foreach (var row in relevantRows)
                    {
                        SufficientStatistics value;
                        if (isMissing)
                        {
                            value = MissingStatistics.GetInstance();
                        }
                        else if (!myRows.Contains(row))
                        {
                            value = (BooleanStatistics)false;   // in all cases, this is false
                        }
                        else if (myRows.Count == 1)
                        {
                            value = (BooleanStatistics)true;
                        }
                        else
                        {
                            switch (mix)
                            {
                            case MixtureSemantics.Any:
                                value = (BooleanStatistics)true;            // Any means we say you have both
                                break;

                            case MixtureSemantics.Pure:
                                value = (BooleanStatistics)false;           // Pure means we say you have neither
                                break;

                            case MixtureSemantics.Uncertainty:
                                value = MissingStatistics.GetInstance();    // Uncertainty says we don't know which you have
                                break;

                            case MixtureSemantics.Distribution:
                                double pTrue = 1.0 / myRows.Count;
                                value = MultinomialStatistics.GetInstance(new double[] { 1 - pTrue, pTrue });
                                break;

                            default:
                                throw new NotImplementedException("Missing a case.");
                            }
                        }
                        m.SetValueOrMissing(row, seq.Name, value);
                    }
                }
            }

            if (!keepOneValueVariables)
            {
                // Capture the full matrix in its own local: the lambda must keep querying the
                // unfiltered matrix even if SelectRowsView enumerates the key sequence lazily,
                // i.e. after 'm' has been reassigned to the filtered view.
                Matrix<string, string, SufficientStatistics> full = m;
                m = full.SelectRowsView(full.RowKeys.Where(row => full.RowView(row).Values.Distinct().Count() > 1));
            }

            switch (dataType)
            {
            case BinaryOrMultistate.Binary:
                return m;

            case BinaryOrMultistate.Multistate:
                return new BinaryToMultistateView<string, string, SufficientStatistics>(m, Tabulate.BinaryToMultistateMapping(m), ValueConverter.SufficientStatisticsToMultinomial);

            default:
                throw new NotImplementedException("Missing a case");
            }
        }