Exemple #1
0
        //public void ParseArgs(ArgumentCollection argumentCollection)
        //{
        //    KeepOneValueVariables = argumentCollection.ExtractOptionalFlag("keepOneValueVariables");
        //    //MissingChar = argumentCollection.ExtractOptional<char>("missingChar", '?');

        //    //int? merLengthOrNull = argumentCollection.ExtractOptional<int?>("Mer", null); // we don't seem to actually use this...
        //    CurrentMixtureSemantics = argumentCollection.ExtractOptional<MixtureSemantics>("MixtureSemantics", MixtureSemantics.Uncertainty);
        //    //Helper.CheckCondition(null == merLengthOrNull || mixtureSemantics == MixtureSemantics.Uncertainty, "The 'Mer' option cannot be used with '-MixtureSemantics pure' or '-MixtureSemantics any'");

        //}

        //public override void ParseReadArgs(ArgumentCollection argumentCollection)
        //{
        //    base.ParseReadArgs(argumentCollection);
        //    ParseArgs(argumentCollection);
        //}

        //public override void ParseWriteArgs(ArgumentCollection argumentCollection)
        //{
        //    base.ParseWriteArgs(argumentCollection);
        //    ParseArgs(argumentCollection);
        //    Format = argumentCollection.ExtractNext<SequenceMatrix.WriteFormat>("WriteFormat");
        //    Type = argumentCollection.ExtractNext<SequenceMatrix.WriteType>("WriteFormat");
        //}

        /// <summary>
        /// Parses a matrix from <paramref name="reader"/> and converts it into named sequences.
        /// </summary>
        /// <param name="reader">Text source holding the matrix; its content is handed to
        /// <c>FileUtils.WriteTextReaderToTempFile</c> because the matrix factory parses from a file name.</param>
        /// <returns>The result of <c>ConvertToSequences</c> applied to the parsed matrix.</returns>
        public override List <NamedSequence> Parse(TextReader reader)
        {
            var factory = MatrixFactory <string, string, SufficientStatistics> .GetInstance();

            // NOTE(review): the file produced here is never removed by this method; presumably the
            // parsed matrix may still read from it afterwards - confirm before adding cleanup.
            string spooledFileName = MBT.Escience.FileUtils.WriteTextReaderToTempFile(reader);

            var parsedMatrix = factory.Parse(
                spooledFileName,
                MissingStatistics.GetInstance(),
                new ParallelOptions());

            return ConvertToSequences(parsedMatrix);
        }
Exemple #2
0
        /// <summary>
        /// Writes a dense matrix (rows: HLAs, columns: patient/genotype ids) of match statistics to
        /// <c>Out</c>, and writes the genotype probabilities of every patient that has more than one
        /// candidate genotype to a separate delimited file.
        /// </summary>
        /// <param name="patientsAndGenotypes">Patient id paired with that patient's candidate genotypes.
        /// The sequence is enumerated exactly once and then reused from a list.</param>
        /// <param name="multipleGenotypesPerPatient">Not read by this implementation.</param>
        protected override void WriteDistilled(IEnumerable <KeyValuePair <string, List <HlaIGenotype> > > patientsAndGenotypes, bool multipleGenotypesPerPatient)
        {
            // Materialize once: the data is walked twice below, and the incoming sequence may be
            // lazy (expensive or unsafe to enumerate repeatedly).
            var patientsAndGenotypesAsList = patientsAndGenotypes.ToList();

            Dictionary <string, MultinomialStatistics> pidToMultinomial  = new Dictionary <string, MultinomialStatistics>();
            Dictionary <string, HlaIGenotype>          pidGenoNameToGeno = new Dictionary <string, HlaIGenotype>(patientsAndGenotypesAsList.Count);

            foreach (var pidAndGenos in patientsAndGenotypesAsList)
            {
                if (pidAndGenos.Value.Count == 1)
                {
                    // A single candidate genotype: one column named after the patient.
                    pidGenoNameToGeno.Add(pidAndGenos.Key, pidAndGenos.Value.Single());
                }
                else
                {
                    // Keep track of the candidates' probabilities; they go to the probability file.
                    pidToMultinomial.Add(pidAndGenos.Key, MultinomialStatistics.GetInstance(pidAndGenos.Value.Select(g => g.Probability)));

                    // The patient's own column holds the union of all candidates...
                    HlaIGenotype unionGenotype = HlaIGenotype.Union(pidAndGenos.Value.ToArray());
                    pidGenoNameToGeno.Add(pidAndGenos.Key, unionGenotype);

                    // ...and every candidate additionally gets its own "<pid>_<index>" column.
                    for (int i = 0; i < pidAndGenos.Value.Count; i++)
                    {
                        string pidGenoId = pidAndGenos.Key + "_" + i;
                        pidGenoNameToGeno.Add(pidGenoId, pidAndGenos.Value[i]);
                    }
                }
            }

            Matrix <string, string, SufficientStatistics> m = DenseMatrix <string, string, SufficientStatistics> .CreateDefaultInstance(
                HlaEnumerator.GenerateHlas(patientsAndGenotypesAsList.SelectMany(pidAndGenos => pidAndGenos.Value)),
                pidGenoNameToGeno.Keys,
                MissingStatistics.GetInstance());

            foreach (string hlastr in m.RowKeys)
            {
                HlaI hla = HlaI.Parse(hlastr);
                foreach (string pid in m.ColKeys)
                {
                    // A null match is passed through to BooleanStatistics.GetInstance unchanged.
                    bool? match = pidGenoNameToGeno[pid].Matches(hla, MixtureSemantics);
                    m.SetValueOrMissing(hlastr, pid, BooleanStatistics.GetInstance(match));
                }
            }

            m.WriteDense(this.Out.CreateTextOrUseConsole());

            string outName = this.Out.ToString();
            string probabilityFile;
            if (outName == "-")
            {
                probabilityFile = "HaplotypeCompletionProbs.txt";
            }
            else
            {
                // Insert the suffix between the file's base name and its extension. Rebuilding the
                // path from its parts (instead of a string Replace over the whole path) leaves
                // directory names and extensions that happen to contain the base name untouched.
                string directory = Path.GetDirectoryName(outName) ?? string.Empty;
                string fileName  = Path.GetFileNameWithoutExtension(outName) + "_haplotypeProbs" + Path.GetExtension(outName);
                probabilityFile = Path.Combine(directory, fileName);
            }

            pidToMultinomial.WriteDelimitedFile(probabilityFile);
        }
Exemple #3
0
        /// <summary>
        /// Produces one <c>TwoByTwo</c> table for every row key that is present in both matrices.
        /// </summary>
        /// <param name="unsorted">When true, tables are yielded as soon as they are built, in the row-key
        /// order of the first matrix. When false, all tables are collected first and yielded in
        /// ascending order of their <c>FisherExactTest</c> value.</param>
        /// <param name="inputMatrixFileName1">File of the first matrix; also passed to <c>TwoByTwo.GetInstance</c>.</param>
        /// <param name="inputMatrixFileName2">File of the second matrix.</param>
        /// <param name="parallelOptions">Forwarded to the matrix parser.</param>
        /// <returns>A lazily evaluated sequence of tables (this is an iterator method).</returns>
        public static IEnumerable <TwoByTwo> GetCollectionFromMatrices(bool unsorted, string inputMatrixFileName1, string inputMatrixFileName2, ParallelOptions parallelOptions)
        {
            var factory = MatrixFactory <string, string, SufficientStatistics> .GetInstance();

            var firstMatrix  = factory.Parse(inputMatrixFileName1, MissingStatistics.GetInstance(), parallelOptions);
            var secondMatrix = factory.Parse(inputMatrixFileName2, MissingStatistics.GetInstance(), parallelOptions);

            // Only filled when the caller asked for sorted output.
            List <TwoByTwo> pendingTables = new List <TwoByTwo>();

            foreach (string rowKey in firstMatrix.RowKeys)
            {
                // Rows that exist only in the first matrix produce no table.
                if (!secondMatrix.ContainsRowKey(rowKey))
                {
                    continue;
                }

                TwoByTwo counts = TwoByTwo.GetInstance(inputMatrixFileName1, rowKey, false);
                var firstRowCells  = firstMatrix.RowView(rowKey).Select(cell => cell);
                var secondRowCells = secondMatrix.RowView(rowKey).Select(cell => cell);

                // The first matrix supplies the "T" row of the table, the second matrix the "F" row.
                // Within each, the sum of the discrete values is the T column and the rest is the F column.
                counts.TT = firstRowCells.Sum(cell => (int)cell.Value.AsDiscreteStatistics());
                counts.TF = firstRowCells.Count() - counts.TT;
                counts.FT = secondRowCells.Sum(cell => (int)cell.Value.AsDiscreteStatistics());
                counts.FF = secondRowCells.Count() - counts.FT;

                if (unsorted)
                {
                    yield return counts;
                }
                else
                {
                    pendingTables.Add(counts);
                }
            }

            if (unsorted)
            {
                yield break;
            }

            pendingTables.Sort((left, right) => left.FisherExactTest.CompareTo(right.FisherExactTest));
            foreach (var sortedTable in pendingTables)
            {
                yield return sortedTable;
            }
        }
Exemple #4
0
        /// <summary>
        /// Builds a matrix whose rows are "position@residue" variables and whose columns are the
        /// sequence names, filling each cell according to what the sequence shows at that position.
        /// </summary>
        /// <param name="sequences">Sequences to tabulate; each contributes one column named after it.</param>
        /// <param name="mix">How a cell is encoded when a sequence shows several residues at a position
        /// and the row's residue is one of them.</param>
        /// <param name="dataType">Binary returns the matrix as built; Multistate wraps it in a
        /// <c>BinaryToMultistateView</c>.</param>
        /// <param name="keepOneValueVariables">When false, rows whose values are not all distinguishable
        /// (at most one distinct value in the row view) are dropped.</param>
        /// <returns>The binary matrix, or a multistate view over it.</returns>
        /// <exception cref="NotImplementedException">Thrown for an unhandled <paramref name="mix"/> or
        /// <paramref name="dataType"/> value.</exception>
        public static Matrix <string, string, SufficientStatistics> ConvertToMatrix(List <NamedSequence> sequences, MixtureSemantics mix, BinaryOrMultistate dataType, bool keepOneValueVariables)
        {
            var colNames = sequences.Select(s => s.Name);

            // Only unambiguous (exactly one residue), non-missing observations define rows; a residue
            // that is only ever seen inside a mixture gets no row of its own.
            // Materialized with ToList because the names are consumed twice below and the query
            // (ordering + Distinct over every position of every sequence) should only run once.
            var rowNames = (from posAndAa in sequences.SelectMany(s => s.AASeq)
                            where posAndAa.Value.Count == 1 && !AASeq.IsMissing(posAndAa.Value)
                            let pos = posAndAa.Key
                            let aas = posAndAa.Value
                            let c = aas.First()
                            let merAndPos = pos + "@" + c
                            orderby pos, c
                            select merAndPos).Distinct().ToList();

            // Position -> all row names defined at that position.
            var posToRowNames = (from row in rowNames
                                 let pos = (int)Tabulate.GetMerAndPos(row).Value
                                 group row by pos into g
                                 select new KeyValuePair <int, List <string> >(g.Key, g.ToList())).ToDictionary();

            Matrix <string, string, SufficientStatistics> m = DenseMatrix <string, string, SufficientStatistics> .CreateDefaultInstance(rowNames, colNames, MissingStatistics.GetInstance());

            foreach (var seq in sequences)
            {
                foreach (var posAndAa in seq.AASeq)
                {
                    int pos = posAndAa.Key;

                    // Single lookup instead of ContainsKey followed by the indexer.
                    List <string> relevantRows;
                    if (!posToRowNames.TryGetValue(pos, out relevantRows))
                    {
                        // No rows at this position: nothing to fill in. The check asserts that this
                        // sequence is missing here.
                        Helper.CheckCondition(AASeq.IsMissing(posAndAa.Value), "Something's wrong. We thinking everyone is missing at position {0}, but {1} has {2}", pos, seq.Name, posAndAa.Value);
                        continue;
                    }

                    bool isMissing = AASeq.IsMissing(posAndAa.Value);
                    // Row names this sequence's observation corresponds to (more than one for a mixture).
                    var  myRows    = posAndAa.Value.Select(c => pos + "@" + c).ToList();
                    foreach (var row in relevantRows)
                    {
                        SufficientStatistics value;
                        if (isMissing)
                        {
                            value = MissingStatistics.GetInstance();
                        }
                        else if (!myRows.Contains(row))
                        {
                            value = (BooleanStatistics)false;   // in all cases, this is false
                        }
                        else if (myRows.Count == 1)
                        {
                            value = (BooleanStatistics)true;
                        }
                        else
                        {
                            // Mixture that includes this row's residue: encoding depends on the semantics.
                            switch (mix)
                            {
                            case MixtureSemantics.Any:
                                value = (BooleanStatistics)true; break;       // Any means we say you have both

                            case MixtureSemantics.Pure:
                                value = (BooleanStatistics)false; break;      // Pure means we say you have neither

                            case MixtureSemantics.Uncertainty:
                                value = MissingStatistics.GetInstance(); break;       // Uncertainty says we don't know which you have.

                            case MixtureSemantics.Distribution:
                                // Probability mass is split evenly over the residues in the mixture.
                                double pTrue = 1.0 / myRows.Count;
                                value = MultinomialStatistics.GetInstance(new double[] { 1 - pTrue, pTrue });
                                break;

                            default:
                                throw new NotImplementedException("Missing a case.");
                            }
                        }
                        m.SetValueOrMissing(row, seq.Name, value);
                    }
                }
            }

            if (!keepOneValueVariables)
            {
                // Evaluate the filter against the full matrix and materialize the surviving keys
                // before 'm' is replaced. The predicate must not observe the view it is used to build,
                // which could happen if SelectRowsView enumerated its argument lazily.
                Matrix <string, string, SufficientStatistics> fullMatrix = m;
                List <string> variableRows = fullMatrix.RowKeys.Where(row => fullMatrix.RowView(row).Values.Distinct().Count() > 1).ToList();
                m = fullMatrix.SelectRowsView(variableRows);
            }

            switch (dataType)
            {
            case BinaryOrMultistate.Binary:
                return m;

            case BinaryOrMultistate.Multistate:
                return new BinaryToMultistateView <string, string, SufficientStatistics>(m, Tabulate.BinaryToMultistateMapping(m), ValueConverter.SufficientStatisticsToMultinomial);

            default:
                throw new NotImplementedException("Missing a case");
            }
        }
Exemple #5
0
        /// <summary>
        /// Writes the sequences as a tab-delimited table: a header line ("Var" followed by the sequence
        /// names) and one line per "position@residue" variable with a 1/0/missing value per sequence.
        /// </summary>
        /// <param name="sequences">Sequences to tabulate, one column each. An empty list yields a table
        /// that consists of the header line only.</param>
        /// <param name="writer">Destination; flushed before returning but not closed.</param>
        public void WriteAsTable(List <NamedSequence> sequences, TextWriter writer)
        {
            CaseIdToAASeq cidToAASeq = CaseIdToAASeq.GetInstance();

            // The first sequence decides whether everything is treated as DNA. With no sequences the
            // flag is irrelevant, so avoid indexing into an empty list.
            bool isDna = sequences.Count > 0 && sequences[0].IsDna();

            foreach (NamedSequence seq in sequences)
            {
                cidToAASeq.Add(seq.Name,
                               isDna ?
                               DnaSeq.GetInstance(seq.Sequence, MixtureSemantics) :
                               AASeq.GetInstance(seq.Sequence, MixtureSemantics));
            }

            List <string> header = new List <string>(sequences.Count + 1);

            header.Add("Var");
            header.AddRange(sequences.Select(seq => seq.Name));

            writer.WriteLine(header.StringJoin("\t"));

            if (sequences.Count == 0)
            {
                // Nothing to tabulate; Max() below would throw on an empty collection.
                writer.Flush();
                return;
            }

            int maxLen = cidToAASeq.Dictionary.Values.Select(aaSeq => aaSeq.Count).Max();

            // Resolve each column's sequence once instead of once per (position, residue, column).
            var seqPerColumn = sequences.Select(seq => cidToAASeq.Dictionary[seq.Name]).ToList();

            for (int pos0 = 0; pos0 < maxLen; pos0++)
            {
                foreach (char aa in cidToAASeq.EveryAminoAcid(pos0))
                {
                    // Row labels use 1-based positions.
                    string        merAndPos        = (pos0 + 1) + "@" + aa;
                    int?[]        values           = new int?[sequences.Count];
                    HashSet <int> nonMissingValues = new HashSet <int>();
                    for (int pidIdx = 0; pidIdx < sequences.Count; pidIdx++)
                    {
                        int?       value;
                        Set <char> observedAAs = seqPerColumn[pidIdx][pos0];
                        if (observedAAs.Contains('?') || observedAAs.Count == 0 ||
                            (observedAAs.Count > 1 && MixtureSemantics == MixtureSemantics.Uncertainty && observedAAs.Contains(aa)))
                        {
                            // Unknown residue, nothing observed, or a mixture containing 'aa' under
                            // Uncertainty semantics: the value is missing.
                            value = null;
                        }
                        else if (observedAAs.Contains(aa) && (MixtureSemantics != MixtureSemantics.Pure || observedAAs.Count == 1))
                        {
                            // 'aa' is observed; under Pure semantics it only counts when it is the sole residue.
                            value = 1;
                        }
                        else
                        {
                            value = 0;
                        }

                        values[pidIdx] = value;
                        if (value != null)
                        {
                            nonMissingValues.Add((int)value);
                        }
                    }

                    // Emit rows that vary across sequences; with KeepOneValueVariables also emit rows
                    // whose only non-missing value is 1.
                    if (nonMissingValues.Count > 1 || (KeepOneValueVariables && nonMissingValues.Count == 1 && nonMissingValues.First() == 1))
                    {
                        writer.WriteLine(Helper.CreateTabString(merAndPos, values.Select(v => v.HasValue ? v.ToString() : MissingStatistics.GetInstance().ToString()).StringJoin("\t")));
                    }
                }
            }


            writer.Flush();
        }