/// <summary>
/// Turns a matrix into one <see cref="NamedSequence"/> per matrix column.
/// </summary>
/// <remarks>
/// Each row key is parsed with <c>Tabulate.GetMerAndPos</c>; the parsed key is split on '#'
/// into its candidate states and the parsed value, minus one, is used as the index of the
/// character within the sequence. If no row key contains '#', the matrix is first wrapped in a
/// <c>BinaryToMultistateView</c>. Indices not covered by any row, and cells whose statistics
/// are missing, are filled with <c>MissingChar</c>.
/// </remarks>
/// <param name="matrix">The matrix to convert; its column keys become the sequence names.</param>
/// <returns>A list holding one sequence for every column of the matrix, in column order.</returns>
public List<NamedSequence> ConvertToSequences(Matrix<string, string, SufficientStatistics> matrix)
{
    // No '#' in any row key means the rows are not already multistate, so wrap them.
    bool everyKeyIsSingleState = matrix.RowKeys.All(rowName => rowName.IndexOf('#') < 0);
    if (everyKeyIsSingleState)
    {
        matrix = new BinaryToMultistateView<string, string, SufficientStatistics>(
            matrix,
            Tabulate.BinaryToMultistateMapping(matrix),
            MBT.Escience.ValueConverter.SufficientStatisticsToInt);
    }

    // Parse every row key once and order the rows by their parsed position.
    var orderedRows = matrix.RowKeys
        .Select(rowName => new { RowName = rowName, Parsed = Tabulate.GetMerAndPos(rowName) })
        .OrderBy(item => item.Parsed.Value)
        .Select(item => new
        {
            Index0 = (int)item.Parsed.Value - 1,
            States = item.Parsed.Key.Split('#'),
            RowName = item.RowName
        })
        .ToList();

    List<NamedSequence> sequences = new List<NamedSequence>(matrix.ColCount);

    for (int col = 0; col < matrix.ColCount; col++)
    {
        var colKey = matrix.ColKeys[col];
        StringBuilder text = new StringBuilder(matrix.RowCount);

        foreach (var row in orderedRows)
        {
            // Rows are sorted by position, so the text may never already be past this row's index.
            Helper.CheckCondition(text.Length <= row.Index0, "There appears to be multiple keys with the same position.");

            // Pad any gap between the previous row's index and this one.
            while (text.Length < row.Index0)
            {
                text.Append(MissingChar);
            }

            DiscreteStatistics state = matrix.GetValueOrMissing(row.RowName, colKey).AsDiscreteStatistics();
            if (state.IsMissing())
            {
                text.Append(MissingChar);
                continue;
            }

            string symbol;
            if (row.States.Length != 1)
            {
                // Several candidate states: the statistic selects one by index.
                symbol = row.States[state];
            }
            else
            {
                // Only one candidate state: the statistic is required to equal 1.
                symbol = row.States[0];
                Helper.CheckCondition((int)state == 1);
            }

            Helper.CheckCondition(symbol.Length == 1, "State {0} is too long. Must be a single character.", symbol);
            text.Append(symbol);
        }

        sequences.Add(new NamedSequence(colKey, text.ToString()));
    }

    return sequences;
}
/// <summary>
/// Builds a matrix of sufficient statistics from a list of named sequences.
/// Rows are named <c>position@character</c>; columns are named after the sequences.
/// </summary>
/// <remarks>
/// A row is only created for a position/character pair that occurs at least once as a
/// single, non-missing observation. Cells are then filled per sequence and position:
/// missing observations stay missing, a character the sequence does not have is false,
/// a sole character is true, and a mixture (several characters) is resolved by
/// <paramref name="mix"/>.
/// </remarks>
/// <param name="sequences">The sequences to tabulate; each contributes one column.</param>
/// <param name="mix">How a mixture cell is encoded (true, false, missing, or a 1/n distribution).</param>
/// <param name="dataType">Whether to return the binary matrix itself or a multistate view over it.</param>
/// <param name="keepOneValueVariables">
/// When false, rows whose cells all hold the same value are dropped from the result.
/// </param>
/// <returns>The binary matrix, or a <c>BinaryToMultistateView</c> wrapping it.</returns>
/// <exception cref="NotImplementedException">
/// <paramref name="mix"/> or <paramref name="dataType"/> has a value not handled here.
/// </exception>
public static Matrix<string, string, SufficientStatistics> ConvertToMatrix(List<NamedSequence> sequences, MixtureSemantics mix, BinaryOrMultistate dataType, bool keepOneValueVariables)
{
    // Materialize both key lists once. They were deferred queries, and the row query
    // (SelectMany + orderby + Distinct over every sequence) was being run twice.
    var colNames = sequences.Select(s => s.Name).ToList();

    // Only unambiguous (single-character, non-missing) observations define rows.
    var rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
                    where posAndAa.Value.Count == 1 && !AASeq.IsMissing(posAndAa.Value)
                    let pos = posAndAa.Key
                    let aas = posAndAa.Value
                    let c = aas.First()
                    let merAndPos = pos + "@" + c
                    orderby pos, c
                    select merAndPos).Distinct().ToList();

    // Position -> all row names defined at that position.
    var posToRowNames = (from row in rowNames
                         let pos = (int)Tabulate.GetMerAndPos(row).Value
                         group row by pos into g
                         select new KeyValuePair<int, List<string>>(g.Key, g.ToList())).ToDictionary();

    Matrix<string, string, SufficientStatistics> m =
        DenseMatrix<string, string, SufficientStatistics>.CreateDefaultInstance(rowNames, colNames, MissingStatistics.GetInstance());

    foreach (var seq in sequences)
    {
        foreach (var posAndAa in seq.AASeq)
        {
            int pos = posAndAa.Key;
            bool isMissing = AASeq.IsMissing(posAndAa.Value);

            List<string> relevantRows;
            if (!posToRowNames.TryGetValue(pos, out relevantRows))
            {
                // No row exists at this position, so this observation is required to be missing.
                Helper.CheckCondition(isMissing, "Something's wrong. We thinking everyone is missing at position {0}, but {1} has {2}", pos, seq.Name, posAndAa.Value);
                continue;
            }

            // The row names this sequence supports at this position (more than one for a mixture).
            var myRows = posAndAa.Value.Select(c => pos + "@" + c).ToList();

            foreach (var row in relevantRows)
            {
                SufficientStatistics value = GetCellStatistics(isMissing, myRows, row, mix);
                m.SetValueOrMissing(row, seq.Name, value);
            }
        }
    }

    if (!keepOneValueVariables)
    {
        // Evaluate the filter against the full matrix and materialize it BEFORE `m` is
        // reassigned: the lambda would otherwise capture the variable `m`, so a lazily
        // enumerated filter could end up reading the new view instead of the full matrix.
        var fullMatrix = m;
        var variableRows = fullMatrix.RowKeys
            .Where(row => fullMatrix.RowView(row).Values.Distinct().Skip(1).Any())
            .ToList();
        m = fullMatrix.SelectRowsView(variableRows);
    }

    switch (dataType)
    {
        case BinaryOrMultistate.Binary:
            return m;
        case BinaryOrMultistate.Multistate:
            return new BinaryToMultistateView<string, string, SufficientStatistics>(m, Tabulate.BinaryToMultistateMapping(m), ValueConverter.SufficientStatisticsToMultinomial);
        default:
            throw new NotImplementedException("Missing a case");
    }
}

/// <summary>
/// Decides the value of a single matrix cell for one row at one sequence position.
/// </summary>
/// <param name="isMissing">Whether the sequence's observation at this position is missing.</param>
/// <param name="myRows">Row names the sequence supports at this position.</param>
/// <param name="row">The row whose cell is being filled.</param>
/// <param name="mix">How to encode the cell when <paramref name="myRows"/> has several entries.</param>
/// <returns>The statistics to store in the cell.</returns>
private static SufficientStatistics GetCellStatistics(bool isMissing, List<string> myRows, string row, MixtureSemantics mix)
{
    if (isMissing)
    {
        return MissingStatistics.GetInstance();
    }

    if (!myRows.Contains(row))
    {
        // In all cases, a state the sequence does not have is false.
        return (BooleanStatistics)false;
    }

    if (myRows.Count == 1)
    {
        return (BooleanStatistics)true;
    }

    // The sequence has a mixture that includes this row's state.
    switch (mix)
    {
        case MixtureSemantics.Any:
            // Any means we say you have both.
            return (BooleanStatistics)true;
        case MixtureSemantics.Pure:
            // Pure means we say you have neither.
            return (BooleanStatistics)false;
        case MixtureSemantics.Uncertainty:
            // Uncertainty says we don't know which you have.
            return MissingStatistics.GetInstance();
        case MixtureSemantics.Distribution:
            // Each state in the mixture gets probability 1/n of being true.
            double pTrue = 1.0 / myRows.Count;
            return MultinomialStatistics.GetInstance(new double[] { 1 - pTrue, pTrue });
        default:
            throw new NotImplementedException("Missing a case.");
    }
}