//public void ParseArgs(ArgumentCollection argumentCollection)
//{
//    KeepOneValueVariables = argumentCollection.ExtractOptionalFlag("keepOneValueVariables");
//    //MissingChar = argumentCollection.ExtractOptional<char>("missingChar", '?');
//    //int? merLengthOrNull = argumentCollection.ExtractOptional<int?>("Mer", null); // we don't seem to actually use this...
//    CurrentMixtureSemantics = argumentCollection.ExtractOptional<MixtureSemantics>("MixtureSemantics", MixtureSemantics.Uncertainty);
//    //Helper.CheckCondition(null == merLengthOrNull || mixtureSemantics == MixtureSemantics.Uncertainty, "The 'Mer' option cannot be used with '-MixtureSemantics pure' or '-MixtureSemantics any'");
//}
//public override void ParseReadArgs(ArgumentCollection argumentCollection)
//{
//    base.ParseReadArgs(argumentCollection);
//    ParseArgs(argumentCollection);
//}
//public override void ParseWriteArgs(ArgumentCollection argumentCollection)
//{
//    base.ParseWriteArgs(argumentCollection);
//    ParseArgs(argumentCollection);
//    Format = argumentCollection.ExtractNext<SequenceMatrix.WriteFormat>("WriteFormat");
//    Type = argumentCollection.ExtractNext<SequenceMatrix.WriteType>("WriteFormat");
//}

/// <summary>
/// Parses a matrix from <paramref name="reader"/> and converts it to a list of named sequences.
/// </summary>
/// <param name="reader">Source of the matrix text.</param>
/// <returns>The result of <c>ConvertToSequences</c> applied to the parsed matrix.</returns>
public override List<NamedSequence> Parse(TextReader reader)
{
    MatrixFactory<string, string, SufficientStatistics> factory =
        MatrixFactory<string, string, SufficientStatistics>.GetInstance();

    // The factory is given a file name, so the reader is first handed to
    // WriteTextReaderToTempFile and the name it returns is parsed instead.
    // NOTE(review): nothing here removes that file afterwards - confirm whether the
    // parsed matrix still needs it before adding cleanup.
    string tempFileName = MBT.Escience.FileUtils.WriteTextReaderToTempFile(reader);

    return ConvertToSequences(
        factory.Parse(tempFileName, MissingStatistics.GetInstance(), new ParallelOptions()));
}
/// <summary>
/// Writes a dense matrix of HLA-versus-genotype matches to <c>Out</c>, and a delimited file holding
/// the genotype probabilities of every patient that does not have exactly one genotype.
/// </summary>
/// <param name="patientsAndGenotypes">Patient id paired with that patient's list of genotypes.</param>
/// <param name="multipleGenotypesPerPatient">Not read by this implementation.</param>
protected override void WriteDistilled(IEnumerable<KeyValuePair<string, List<HlaIGenotype>>> patientsAndGenotypes, bool multipleGenotypesPerPatient)
{
    // Materialize once and use the list everywhere below: the input is walked more than once,
    // and a lazy or one-shot sequence would otherwise be re-evaluated (or be empty) the second time.
    var patientsAndGenotypesAsList = patientsAndGenotypes.ToList();

    Dictionary<string, MultinomialStatistics> pidToMultinomial = new Dictionary<string, MultinomialStatistics>();
    Dictionary<string, HlaIGenotype> pidGenoNameToGeno = new Dictionary<string, HlaIGenotype>(patientsAndGenotypesAsList.Count);

    foreach (var pidAndGenos in patientsAndGenotypesAsList)
    {
        if (pidAndGenos.Value.Count == 1)
        {
            pidGenoNameToGeno.Add(pidAndGenos.Key, pidAndGenos.Value.Single());
        }
        else
        {
            // keep track of the probabilities in this dictionary
            pidToMultinomial.Add(pidAndGenos.Key, MultinomialStatistics.GetInstance(pidAndGenos.Value.Select(g => g.Probability)));

            // Add the union genotype to the matrix under the patient's own id...
            HlaIGenotype unionGenotype = HlaIGenotype.Union(pidAndGenos.Value.ToArray());
            pidGenoNameToGeno.Add(pidAndGenos.Key, unionGenotype);

            // ...and each individual genotype under "<pid>_<index>".
            for (int i = 0; i < pidAndGenos.Value.Count; i++)
            {
                string pidGenoId = pidAndGenos.Key + "_" + i;
                pidGenoNameToGeno.Add(pidGenoId, pidAndGenos.Value[i]);
            }
        }
    }

    Matrix<string, string, SufficientStatistics> m = DenseMatrix<string, string, SufficientStatistics>.CreateDefaultInstance(
        HlaEnumerator.GenerateHlas(patientsAndGenotypesAsList.SelectMany(pidAndGenos => pidAndGenos.Value)),
        pidGenoNameToGeno.Keys,
        MissingStatistics.GetInstance());

    foreach (string hlastr in m.RowKeys)
    {
        HlaI hla = HlaI.Parse(hlastr);
        foreach (string pid in m.ColKeys)
        {
            bool? match = pidGenoNameToGeno[pid].Matches(hla, MixtureSemantics);
            m.SetValueOrMissing(hlastr, pid, BooleanStatistics.GetInstance(match));
        }
    }

    m.WriteDense(this.Out.CreateTextOrUseConsole());

    string outName = this.Out.ToString();
    string probabilityFile;
    if (outName == "-")
    {
        probabilityFile = "HaplotypeCompletionProbs.txt";
    }
    else
    {
        // Insert the suffix between the file name's stem and its extension only.
        // string.Replace(baseName, ...) would also rewrite any directory component or extension
        // that happens to contain the base name, and throws when the base name is empty.
        string fileName = Path.GetFileName(outName);
        string directoryPrefix = outName.Substring(0, outName.Length - fileName.Length);
        string stem = Path.GetFileNameWithoutExtension(fileName);
        string extension = fileName.Substring(stem.Length);
        probabilityFile = directoryPrefix + stem + "_haplotypeProbs" + extension;
    }

    pidToMultinomial.WriteDelimitedFile(probabilityFile);
}
/// <summary>
/// Parses two matrices and yields one <c>TwoByTwo</c> table for every row key of the first matrix
/// that is also a row key of the second.
/// </summary>
/// <param name="unsorted">
/// When true, tables are yielded as they are built. When false, all tables are collected,
/// sorted ascending by <c>FisherExactTest</c>, and then yielded.
/// </param>
/// <param name="inputMatrixFileName1">File parsed as the first matrix; fills the TT/TF cells.</param>
/// <param name="inputMatrixFileName2">File parsed as the second matrix; fills the FT/FF cells.</param>
/// <param name="parallelOptions">Passed through to both parse calls.</param>
/// <returns>A lazily-produced sequence of tables; nothing is parsed until enumeration starts.</returns>
public static IEnumerable<TwoByTwo> GetCollectionFromMatrices(bool unsorted, string inputMatrixFileName1, string inputMatrixFileName2, ParallelOptions parallelOptions)
{
    MatrixFactory<string, string, SufficientStatistics> factory =
        MatrixFactory<string, string, SufficientStatistics>.GetInstance();
    var firstMatrix = factory.Parse(inputMatrixFileName1, MissingStatistics.GetInstance(), parallelOptions);
    var secondMatrix = factory.Parse(inputMatrixFileName2, MissingStatistics.GetInstance(), parallelOptions);

    // Only filled when the caller asked for sorted output.
    List<TwoByTwo> pendingTables = new List<TwoByTwo>();

    foreach (string rowKey in firstMatrix.RowKeys)
    {
        if (!secondMatrix.ContainsRowKey(rowKey))
        {
            continue;
        }

        TwoByTwo counts = TwoByTwo.GetInstance(inputMatrixFileName1, rowKey, false);
        var firstRow = firstMatrix.RowView(rowKey).Select(entry => entry);
        var secondRow = secondMatrix.RowView(rowKey).Select(entry => entry);

        // The first matrix supplies the "T" half of the table, the second the "F" half.
        // Within each half the second index is the sum of the row's values cast to int,
        // and the remainder of the row's entries go to the other cell.
        // NOTE(review): this presumes the values are 0/1 - confirm.
        counts.TT = firstRow.Sum(entry => (int)entry.Value.AsDiscreteStatistics());
        counts.TF = firstRow.Count() - counts.TT;
        counts.FT = secondRow.Sum(entry => (int)entry.Value.AsDiscreteStatistics());
        counts.FF = secondRow.Count() - counts.FT;

        if (!unsorted)
        {
            pendingTables.Add(counts);
        }
        else
        {
            yield return counts;
        }
    }

    if (unsorted)
    {
        yield break;
    }

    pendingTables.Sort((left, right) => left.FisherExactTest.CompareTo(right.FisherExactTest));
    foreach (var sortedTable in pendingTables)
    {
        yield return sortedTable;
    }
}
/// <summary>
/// Builds a matrix with one column per sequence and one row per "position@character" variable.
/// </summary>
/// <remarks>
/// Row variables are created only from observations that hold exactly one character and are not
/// missing. Observations with several characters are then scored against those rows according to
/// <paramref name="mix"/>.
/// </remarks>
/// <param name="sequences">Sequences to tabulate; each sequence's name becomes a column key.</param>
/// <param name="mix">How a cell is scored when the sequence has several characters at the position, one of which is the row's.</param>
/// <param name="dataType">Binary returns the matrix as built; Multistate wraps it in a <c>BinaryToMultistateView</c>.</param>
/// <param name="keepOneValueVariables">When false, rows whose values are not more than one distinct value are dropped.</param>
/// <returns>The binary matrix, or a multistate view over it.</returns>
/// <exception cref="NotImplementedException">An unhandled <paramref name="mix"/> or <paramref name="dataType"/> value is encountered.</exception>
public static Matrix<string, string, SufficientStatistics> ConvertToMatrix(List<NamedSequence> sequences, MixtureSemantics mix, BinaryOrMultistate dataType, bool keepOneValueVariables)
{
    List<string> colNames = sequences.Select(s => s.Name).ToList();

    //var rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
    //                where !AASeq.IsMissing(posAndAa.Value)
    //                let pos = posAndAa.Key
    //                let aas = posAndAa.Value
    //                from c in aas
    //                let merAndPos = pos + "@" + c
    //                orderby pos, c
    //                select merAndPos).Distinct();

    // Materialized because the names are consumed twice below (grouping and matrix creation);
    // left lazy, the whole SelectMany/orderby/Distinct pipeline would run once per consumer.
    List<string> rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
                             where posAndAa.Value.Count == 1 && !AASeq.IsMissing(posAndAa.Value)
                             let pos = posAndAa.Key
                             let aas = posAndAa.Value
                             let c = aas.First()
                             let merAndPos = pos + "@" + c
                             orderby pos, c
                             select merAndPos).Distinct().ToList();

    // Position -> every row name that belongs to that position.
    var posToRowNames = (from row in rowNames
                         let pos = (int)Tabulate.GetMerAndPos(row).Value
                         group row by pos into g
                         select new KeyValuePair<int, List<string>>(g.Key, g.ToList())).ToDictionary();

    Matrix<string, string, SufficientStatistics> m = DenseMatrix<string, string, SufficientStatistics>.CreateDefaultInstance(rowNames, colNames, MissingStatistics.GetInstance());

    foreach (var seq in sequences)
    {
        foreach (var posAndAa in seq.AASeq)
        {
            int pos = posAndAa.Key;

            List<string> relevantRows;
            if (!posToRowNames.TryGetValue(pos, out relevantRows))
            {
                // No row exists for this position, so this observation is expected to be missing.
                Helper.CheckCondition(AASeq.IsMissing(posAndAa.Value), "Something's wrong. We thinking everyone is missing at position {0}, but {1} has {2}", pos, seq.Name, posAndAa.Value);
                continue;
            }

            bool isMissing = AASeq.IsMissing(posAndAa.Value);
            var myRows = posAndAa.Value.Select(c => pos + "@" + c).ToList();

            foreach (var row in relevantRows)
            {
                SufficientStatistics value;
                if (isMissing)
                {
                    value = MissingStatistics.GetInstance();
                }
                else if (!myRows.Contains(row))
                {
                    value = (BooleanStatistics)false; //in all cases, this is false
                }
                else if (myRows.Count == 1)
                {
                    value = (BooleanStatistics)true;
                }
                else
                {
                    switch (mix)
                    {
                        case MixtureSemantics.Any:
                            value = (BooleanStatistics)true; //Any means we say you have both
                            break;
                        case MixtureSemantics.Pure:
                            value = (BooleanStatistics)false; //Pure means we say you have neither
                            break;
                        case MixtureSemantics.Uncertainty:
                            value = MissingStatistics.GetInstance(); //Uncertainty says we don't know which you have.
                            break;
                        case MixtureSemantics.Distribution:
                            // Probability mass is split evenly across the observed characters.
                            double pTrue = 1.0 / myRows.Count;
                            value = MultinomialStatistics.GetInstance(new double[] { 1 - pTrue, pTrue });
                            break;
                        default:
                            throw new NotImplementedException("Missing a case.");
                    }
                }
                m.SetValueOrMissing(row, seq.Name, value);
            }
        }
    }

    if (!keepOneValueVariables)
    {
        // Resolve the surviving row keys before reassigning m. The predicate closes over the
        // variable m itself, so a filter that is still lazy after the assignment would be
        // evaluated against the new view rather than the full matrix.
        List<string> rowsWithVariation = m.RowKeys.Where(row => m.RowView(row).Values.Distinct().Count() > 1).ToList();
        m = m.SelectRowsView(rowsWithVariation);
    }

    switch (dataType)
    {
        case BinaryOrMultistate.Binary:
            return m;
        case BinaryOrMultistate.Multistate:
            return new BinaryToMultistateView<string, string, SufficientStatistics>(m, Tabulate.BinaryToMultistateMapping(m), ValueConverter.SufficientStatisticsToMultinomial);
        default:
            throw new NotImplementedException("Missing a case");
    }
}
/// <summary>
/// Writes the sequences to <paramref name="writer"/> as a tab-delimited table: a header line
/// ("Var" followed by the sequence names) and then one line per retained "position@character"
/// variable, with one cell per sequence.
/// </summary>
/// <remarks>
/// Positions in row names are 1-based. A cell is 1, 0, or the string form of
/// <c>MissingStatistics.GetInstance()</c>. A row is written only when its non-missing cells
/// contain more than one distinct value, or when <c>KeepOneValueVariables</c> is set and the
/// only non-missing value is 1.
/// </remarks>
/// <param name="sequences">Sequences to tabulate; the first one decides whether all are read as DNA.</param>
/// <param name="writer">Destination; flushed before returning but not closed.</param>
public void WriteAsTable(List<NamedSequence> sequences, TextWriter writer)
{
    CaseIdToAASeq sequencesByName = CaseIdToAASeq.GetInstance();
    bool treatAsDna = sequences[0].IsDna();
    foreach (NamedSequence named in sequences)
    {
        sequencesByName.Add(named.Name, treatAsDna ? DnaSeq.GetInstance(named.Sequence, MixtureSemantics) : AASeq.GetInstance(named.Sequence, MixtureSemantics));
    }

    List<string> headerCells = new List<string>(sequences.Count + 1) { "Var" };
    headerCells.AddRange(sequences.Select(named => named.Name));
    writer.WriteLine(headerCells.StringJoin("\t"));

    int longest = sequencesByName.Dictionary.Values.Max(aaSeq => aaSeq.Count);
    for (int pos0 = 0; pos0 < longest; pos0++)
    {
        foreach (char aa in sequencesByName.EveryAminoAcid(pos0))
        {
            int?[] cells = new int?[sequences.Count];
            HashSet<int> distinctCalls = new HashSet<int>();

            for (int column = 0; column < sequences.Count; column++)
            {
                Set<char> observed = sequencesByName.Dictionary[sequences[column].Name][pos0];

                // Missing when the observation is '?', empty, or a mixture containing this
                // character under Uncertainty semantics.
                bool unknown = observed.Contains('?')
                    || observed.Count == 0
                    || (observed.Count > 1 && MixtureSemantics == MixtureSemantics.Uncertainty && observed.Contains(aa));

                int? cell;
                if (unknown)
                {
                    cell = null;
                }
                else
                {
                    // Present counts as 1 unless Pure semantics is in effect and the observation is a mixture.
                    bool countsAsPresent = observed.Contains(aa) && (MixtureSemantics != MixtureSemantics.Pure || observed.Count == 1);
                    cell = countsAsPresent ? 1 : 0;
                }

                cells[column] = cell;
                if (cell.HasValue)
                {
                    distinctCalls.Add(cell.Value);
                }
            }

            bool keepRow = distinctCalls.Count > 1
                || (KeepOneValueVariables && distinctCalls.Count == 1 && distinctCalls.First() == 1);
            if (!keepRow)
            {
                continue;
            }

            string rowName = (pos0 + 1) + "@" + aa;
            string rowCells = cells.Select(v => v.HasValue ? v.Value.ToString() : MissingStatistics.GetInstance().ToString()).StringJoin("\t");
            writer.WriteLine(Helper.CreateTabString(rowName, rowCells));
        }
    }

    writer.Flush();
}