Пример #1
0
        /// <summary>
        /// Redraws the peptide plot for the current grid selection, or for the previously plotted
        /// peptides when nothing is selected.
        /// </summary>
        private void UpdatePeptidePlots()
        {
            var selectedItem = DisplayPeptidesDataGrid.SelectedItem;
            if (selectedItem == null)
            {
                // No selection: fall back to the last plotted peptides, but only when there are any
                // and every one of them is still contained in PeptidesToDisplay. Otherwise do nothing.
                bool previousPlotStillValid = PeptidesPreviouslyPlotted.Count != 0
                                              && PeptidesPreviouslyPlotted.All(previous => PeptidesToDisplay.Contains(previous));
                if (previousPlotStillValid)
                {
                    PlotPeptideData(PeptidesPreviouslyPlotted);
                }
                return;
            }

            // A row is selected: collect every displayed entry with the same full sequence that also
            // matches the row's protein (only when DisplayProteinInSpecificTable is set) or its proteoform,
            // leaving out entries whose file is listed in FilesToHideObservableCollection.
            PeptideTurnoverObject selected = (PeptideTurnoverObject)selectedItem;
            List<PeptideTurnoverObject> visibleMatches = PeptidesToDisplay
                .Where(candidate => candidate.FullSequence.Equals(selected.FullSequence))
                .Where(candidate => (DisplayProteinInSpecificTable && candidate.Protein.Equals(selected.Protein))
                                    || candidate.Proteoform.Equals(selected.Proteoform))
                .Where(candidate => !FilesToHideObservableCollection.Contains(candidate.FileName))
                .ToList();

            PlotPeptideData(visibleMatches);

            // Remember what was plotted so it can be redrawn when the selection is cleared.
            PeptidesPreviouslyPlotted = visibleMatches;
        }
Пример #2
0
        /// <summary>
        /// Builds one protein-level <see cref="PeptideTurnoverObject"/> per supplied group by pooling the
        /// measurements and Monte Carlo kbi values of the group's peptides, then writes a tab-separated
        /// summary of all created objects to disk.
        /// </summary>
        /// <param name="peptides">Only the FileName of the first element is read; it is passed to every created object. Must be non-empty whenever <paramref name="grouping"/> is non-empty.</param>
        /// <param name="filePath">Input file path. The summary is written into the folder named after this file (without extension) plus "_Results", located next to the file.</param>
        /// <param name="grouping">Pre-grouped peptides; each group's key is used as the protein name.</param>
        /// <param name="analysisType">Text inserted into the output file name before "TurnoverResults.tsv".</param>
        /// <returns>The created protein-level objects, one per group, in group order.</returns>
        /// <exception cref="ArgumentOutOfRangeException">A group's peptides carry no Monte Carlo kbi values at all.</exception>
        public static List <PeptideTurnoverObject> GetProteinInfo(List <PeptideTurnoverObject> peptides, string filePath, List <IGrouping <string, PeptideTurnoverObject> > grouping, string analysisType)
        {
            // ln(2): converts between a rate constant (kbi) and a half-life in both directions.
            double ln2 = Math.Log(2, Math.E);

            List<PeptideTurnoverObject> proteinsToReturn = new List<PeptideTurnoverObject>();

            foreach (var group in grouping)
            {
                string proteinName            = group.Key;
                var    peptidesForThisProtein = group.OrderBy(x => x.BaseSequence).ThenBy(x => x.FullSequence).ToList();

                // Pool every measurement of every peptide in the group.
                List<double> kbis        = new List<double>();
                List<double> timepoints  = new List<double>();
                List<double> rfs         = new List<double>();
                List<string> filenames   = new List<string>();
                List<double> intensities = new List<double>();
                double       intensity   = 0;
                foreach (var peptide in peptidesForThisProtein)
                {
                    kbis.AddRange(peptide.MonteCarloKbis);
                    timepoints.AddRange(peptide.Timepoints);
                    rfs.AddRange(peptide.RelativeFractions);
                    filenames.AddRange(peptide.Filenames);
                    intensities.AddRange(peptide.Intensities);
                    intensity += peptide.TotalIntensity;
                }

                // Every sequence followed by ';', built in one pass instead of repeated string appends.
                string allPeptideSequences = string.Concat(peptidesForThisProtein.Select(x => x.FullSequence + ";"));
                //keep excel compatible
                if (allPeptideSequences.Length > 10000)
                {
                    allPeptideSequences = "Too Many Sequences";
                }

                kbis.Sort();

                PeptideTurnoverObject protein = new PeptideTurnoverObject(allPeptideSequences, timepoints.ToArray(), rfs.ToArray(),
                                                                          filenames.ToArray(), intensities.ToArray(), intensity, peptides.First().FileName, proteinName);

                // Median is taken in half-life space (not kbi space) and then transformed back to kbi.
                // Even count: average the two central values. Odd count: both indices point at the single
                // central value, so one pooled value no longer indexes at -1.
                int    upperMiddle        = kbis.Count / 2;
                int    lowerMiddle        = kbis.Count % 2 == 0 ? upperMiddle - 1 : upperMiddle;
                double medianHalfLifeLow  = ln2 / kbis[lowerMiddle];
                double medianHalfLifeHigh = ln2 / kbis[upperMiddle];
                double medianHalfLife     = (medianHalfLifeHigh + medianHalfLifeLow) / 2;

                protein.Kbi            = ln2 / medianHalfLife;
                protein.LowKbi         = kbis[(int)Math.Round((kbis.Count - 1) * 2.5 / 100)];  //2.5 percentile
                protein.HighKbi        = kbis[(int)Math.Round((kbis.Count - 1) * 97.5 / 100)]; //97.5 percentile
                protein.MonteCarloKbis = kbis.ToArray();
                protein.Error          = peptidesForThisProtein.Count;                         //not the actual error: stores the number of peptides
                proteinsToReturn.Add(protein);
            }

            List<string> linesToWrite = new List<string>
            {
                "Protein\tHalf Life\tLowerConfidenceInterval\tUpperConfidenceInterval\tSummed Intensity\tNumber of Ratios\tNumber of Peptides\tPeptideSequences"
            };
            foreach (PeptideTurnoverObject protein in proteinsToReturn)
            {
                // A higher kbi means a shorter half-life, so HighKbi yields the lower half-life bound and
                // LowKbi the upper one.
                linesToWrite.Add(protein.Protein + '\t' + (ln2 / protein.Kbi).ToString() + '\t' +
                                 (ln2 / protein.HighKbi).ToString() + '\t' +
                                 (ln2 / protein.LowKbi).ToString() + '\t' + protein.TotalIntensity.ToString() + '\t' +
                                 protein.Timepoints.Length.ToString() + '\t' + protein.Error.ToString() + '\t' + protein.FullSequence);
            }

            //output each protein with its half-life, 95% confidence interval, intensity, counts and sequences
            string directory        = Directory.GetParent(filePath).FullName;
            string filename         = Path.GetFileNameWithoutExtension(filePath);
            string resultsDirectory = Path.Combine(directory, filename + "_Results");

            // Make sure the results folder exists; this is a no-op when it is already there.
            Directory.CreateDirectory(resultsDirectory);
            File.WriteAllLines(Path.Combine(resultsDirectory, filename + "_" + analysisType + "TurnoverResults.tsv"), linesToWrite);

            return(proteinsToReturn);
        }
Пример #3
0
        /// <summary>
        /// Estimates a 95% interval for the peptide's Kbi by repeatedly simulating relative-fraction data,
        /// refitting Kbi to each simulated data set, and taking percentiles of the refitted values.
        /// Writes LowKbi, HighKbi and MonteCarloKbis on <paramref name="peptide"/>.
        /// </summary>
        /// <remarks>
        /// The peptide's RelativeFractions, Kbi and Error are overwritten while simulating and are put back
        /// to their original values before the method exits, including when a refit throws.
        /// Measurements are grouped by runs of consecutive equal timepoints, so equal timepoints are
        /// expected to be adjacent (presumably the array is sorted; confirm against callers).
        /// The random generator uses a fixed seed of 1, so repeated calls draw the same random sequence.
        /// </remarks>
        /// <param name="kst">Model parameter forwarded unchanged to the prediction and fitting routines.</param>
        /// <param name="kbt">Model parameter forwarded unchanged to the prediction and fitting routines.</param>
        /// <param name="kao">Model parameter forwarded unchanged to the prediction and fitting routines.</param>
        /// <param name="peptide">Peptide whose interval is computed; must have at least one timepoint.</param>
        /// <param name="ITERATIVE_SHIFT">Step size for the refits; each simulation refits at this size, then at 1/10 and 1/100 of it.</param>
        public static void UpdateKbiConfidenceInterval(double kst, double kbt, double kao, PeptideTurnoverObject peptide, double ITERATIVE_SHIFT)
        {
            const int NUM_SIMULATIONS = 200;

            double[] timepoints       = peptide.Timepoints;
            double[] relativeFraction = peptide.RelativeFractions;
            double   originalKbi      = peptide.Kbi; //save it so it doesn't get overwritten

            // Both lists end up with one entry per measurement index. All indices inside a run of equal
            // timepoints share the same sigma and the same list instance of observed values.
            List<List<double>> relativeFractionsForEachTimePoint = new List<List<double>>();
            List<double>       sigmasForEachTimePoint            = new List<double>();

            double previousTimepoint = timepoints[0];
            int    previousI         = 0; // first index of the run that has not been flushed yet

            //get predicted values
            double[] predicted = PredictRelativeFractionUsingThreeCompartmentModel(kst, kbt, kao, peptide.Kbi, timepoints);

            List<double> squaredResidualsForThisTimepoint  = new List<double> { Math.Pow(relativeFraction[0] - predicted[0], 2) };
            List<double> relativeFractionsForThisTimepoint = new List<double> { relativeFraction[0] };

            for (int i = 1; i < timepoints.Length; i++)
            {
                if (timepoints[i].Equals(previousTimepoint))
                {
                    squaredResidualsForThisTimepoint.Add(Math.Pow(relativeFraction[i] - predicted[i], 2));
                    relativeFractionsForThisTimepoint.Add(relativeFraction[i]);
                }
                else
                {
                    // The run ended: sigma = sqrt(sum of squared residuals / (n - 1)); the divisor is
                    // floored at 1 to prevent infinity if the run has only one point.
                    double sigma = Math.Sqrt(squaredResidualsForThisTimepoint.Sum() / Math.Max(squaredResidualsForThisTimepoint.Count - 1, 1));
                    for (; previousI < i; previousI++)
                    {
                        sigmasForEachTimePoint.Add(sigma);
                        relativeFractionsForEachTimePoint.Add(relativeFractionsForThisTimepoint);
                    }

                    squaredResidualsForThisTimepoint  = new List<double> { Math.Pow(relativeFraction[i] - predicted[i], 2) };
                    relativeFractionsForThisTimepoint = new List<double> { relativeFraction[i] };
                    previousTimepoint                 = timepoints[i];
                }
            }

            // Flush the last run.
            double lastSigma = Math.Sqrt(squaredResidualsForThisTimepoint.Sum() / Math.Max(squaredResidualsForThisTimepoint.Count - 1, 1));
            for (; previousI < timepoints.Length; previousI++)
            {
                sigmasForEachTimePoint.Add(lastSigma);
                relativeFractionsForEachTimePoint.Add(relativeFractionsForThisTimepoint);
            }

            double originalError = peptide.Error;

            int lowPercentile  = (int)Math.Round((NUM_SIMULATIONS - 1) * 2.5 / 100);  //2.5 percentile
            int highPercentile = (int)Math.Round((NUM_SIMULATIONS - 1) * 97.5 / 100); //97.5 percentile

            //simulate data through monte carlo
            Random   rng           = new Random(1);
            double[] bootstrapKbis = new double[NUM_SIMULATIONS];
            try
            {
                for (int s = 0; s < NUM_SIMULATIONS; s++)
                {
                    double[] simulatedData = new double[timepoints.Length];
                    for (int i = 0; i < timepoints.Length; i++)
                    {
                        //take an actual value from the same timepoint (sampling with replacement) and then add noise based on the residuals
                        List<double> actualValuesForThisTimepoint = relativeFractionsForEachTimePoint[i];
                        double       sampleValue    = actualValuesForThisTimepoint[rng.Next(actualValuesForThisTimepoint.Count)];
                        double       varianceToAdd  = NormInv(rng.NextDouble(), 0, sigmasForEachTimePoint[i]);
                        double       simulatedValue = sampleValue + varianceToAdd;

                        //don't allow the simulated value to be an unreal value: keep it within [0, 1]
                        if (simulatedValue > 1)
                        {
                            simulatedValue = 1;
                        }
                        if (simulatedValue < 0)
                        {
                            simulatedValue = 0;
                        }
                        simulatedData[i] = simulatedValue;
                    }
                    peptide.RelativeFractions = simulatedData;

                    //optimize on the simulated data with progressively finer steps; each refit starts
                    //from the Kbi left on the peptide by the previous one
                    UpdateKbi(kst, kbt, kao, peptide, ITERATIVE_SHIFT);
                    UpdateKbi(kst, kbt, kao, peptide, ITERATIVE_SHIFT / 10);
                    UpdateKbi(kst, kbt, kao, peptide, ITERATIVE_SHIFT / 100);
                    bootstrapKbis[s] = peptide.Kbi;
                }
            }
            finally
            {
                // Always put the real data and fit back, even if a refit throws, so the peptide is never
                // left holding simulated values.
                peptide.Kbi = originalKbi;
                peptide.RelativeFractions = relativeFraction;
                peptide.Timepoints        = timepoints;
                peptide.Error             = originalError;
            }

            Array.Sort(bootstrapKbis);
            peptide.LowKbi         = bootstrapKbis[lowPercentile];  //2.5 percentile
            peptide.HighKbi        = bootstrapKbis[highPercentile]; //97.5 percentile
            peptide.MonteCarloKbis = bootstrapKbis;
        }
Пример #4
0
        /// <summary>
        /// Refines the peptide's Kbi by repeatedly stepping it in whichever direction lowers the fit error
        /// reported by CalculateErrorForThreeCompartmentModelFit, until a step no longer improves the
        /// error or Kbi leaves the allowed range.
        /// </summary>
        /// <remarks>
        /// Always recomputes peptide.Error and peptide.TemporaryError for the starting Kbi. When at least
        /// one improving step is found, writes the refined Kbi and the matching TemporaryError.
        /// Finishes by calling peptide.UpdateError().
        /// </remarks>
        /// <param name="kst">Model parameter forwarded unchanged to the error calculation.</param>
        /// <param name="kbt">Model parameter forwarded unchanged to the error calculation.</param>
        /// <param name="kao">Model parameter forwarded unchanged to the error calculation.</param>
        /// <param name="peptide">Peptide whose Kbi is refined in place.</param>
        /// <param name="ITERATIVE_SHIFT">Base step size; also sets the lower Kbi limit (10 x step) and the scaling of accelerated steps (50 x step).</param>
        public static void UpdateKbi(double kst, double kbt, double kao, PeptideTurnoverObject peptide, double ITERATIVE_SHIFT = ITERATIVE_SHIFT)
        {
            const double MAX_KBI = 2.0;
            double       MIN_KBI = ITERATIVE_SHIFT * 10;
            double       ARBITRARY_GRADIENT_FACTOR = 50 * ITERATIVE_SHIFT;

            double[] timepoints        = peptide.Timepoints;
            double[] relativeFractions = peptide.RelativeFractions;
            double   kbi = peptide.Kbi;

            // Fit error as a function of a candidate kbi, with everything else held fixed.
            Func<double, double> errorAt = candidateKbi =>
                CalculateErrorForThreeCompartmentModelFit(kst, kbt, kao, candidateKbi, timepoints, relativeFractions);

            peptide.Error          = errorAt(kbi);
            peptide.TemporaryError = peptide.Error;
            double originalError = peptide.Error;

            bool   increaseKbi;
            double updatedError = ProbeAdjacentKbiErrors(errorAt, kbi, ITERATIVE_SHIFT, originalError, out increaseKbi);

            if (originalError > updatedError)
            {
                double diff = SelectKbiStepSize(errorAt, kbi, ITERATIVE_SHIFT, MIN_KBI, MAX_KBI, ARBITRARY_GRADIENT_FACTOR, increaseKbi, originalError, ref updatedError);
                if (increaseKbi)
                {
                    kbi += diff;
                }
                else
                {
                    kbi -= diff;
                }

                while (originalError > updatedError)
                {
                    if (kbi > MAX_KBI || kbi < MIN_KBI) //max and min allowed
                    {
                        break;
                    }
                    originalError = updatedError;

                    updatedError = ProbeAdjacentKbiErrors(errorAt, kbi, ITERATIVE_SHIFT, originalError, out increaseKbi);
                    diff         = SelectKbiStepSize(errorAt, kbi, ITERATIVE_SHIFT, MIN_KBI, MAX_KBI, ARBITRARY_GRADIENT_FACTOR, increaseKbi, originalError, ref updatedError);

                    if (increaseKbi)
                    {
                        kbi += diff;
                    }
                    else
                    {
                        kbi -= diff;
                    }
                }
                peptide.TemporaryError = originalError;

                // The most recent step either failed to improve the error or pushed kbi out of range,
                // so take it back before storing the result.
                if (increaseKbi)
                {
                    kbi -= diff;
                }
                else
                {
                    kbi += diff;
                }
                peptide.Kbi = kbi;
            }

            peptide.UpdateError();
        }

        /// <summary>
        /// Evaluates the error one base step above kbi and, only if that is not better than
        /// referenceError, one base step below it.
        /// </summary>
        /// <param name="increaseKbi">True when the returned error belongs to kbi + shift, false when it belongs to kbi - shift.</param>
        /// <returns>The error of the direction that was kept.</returns>
        private static double ProbeAdjacentKbiErrors(Func<double, double> errorAt, double kbi, double shift, double referenceError, out bool increaseKbi)
        {
            double probedError = errorAt(kbi + shift);
            increaseKbi = true;
            if (!(probedError < referenceError))
            {
                probedError = errorAt(kbi - shift);
                increaseKbi = false;
            }
            return probedError;
        }

        /// <summary>
        /// Chooses how far to move kbi in the probed direction: an accelerated step proportional to the
        /// error improvement when that step is large enough and really lowers the error, otherwise one
        /// base step.
        /// </summary>
        /// <param name="updatedError">Error of the one-step probe; replaced by the accelerated step's error when that step is accepted.</param>
        /// <returns>The step size to apply.</returns>
        private static double SelectKbiStepSize(Func<double, double> errorAt, double kbi, double shift, double minKbi, double maxKbi,
                                                double gradientFactor, bool increaseKbi, double originalError, ref double updatedError)
        {
            double diff = (originalError - updatedError) / gradientFactor;
            if (!(diff > shift * 3))
            {
                return shift;
            }

            // Snap the accelerated step to a whole number of base steps.
            diff = Math.Round(diff / shift) * shift;

            double candidateKbi;
            if (increaseKbi)
            {
                if (kbi + diff > maxKbi)
                {
                    // Land one base step below the upper limit. The distance is measured from kbi,
                    // mirroring the lower-limit case (the earlier code subtracted diff here instead).
                    diff = maxKbi - kbi - shift;
                }
                candidateKbi = kbi + diff;
            }
            else
            {
                if (kbi - diff < minKbi)
                {
                    // Land one base step above the lower limit.
                    diff = kbi - minKbi - shift;
                }
                candidateKbi = kbi - diff;
            }

            double tempError = errorAt(candidateKbi);
            if (!(tempError < updatedError))
            {
                // The accelerated step is no better than the single-step probe: fall back to it.
                return shift;
            }

            updatedError = tempError;
            return diff;
        }
Пример #5
0
        /// <summary>
        /// Loads previously saved results from a tab-separated file: pool parameters on the first line,
        /// followed by peptide, protein and proteoform records. Consecutive sections are separated by a
        /// line that contains no tab.
        /// </summary>
        /// <remarks>
        /// The whole file is parsed into local buffers first. The caller's collections are only touched
        /// once every line has parsed, so a malformed file no longer leaves them partially filled.
        /// Numbers are parsed with Convert.ToDouble, i.e. using the current culture
        /// (NOTE(review): presumably this matches how the file was written; confirm against the writer).
        /// </remarks>
        /// <param name="inputFile">Key under which the pool parameters are stored; also passed as the file name of every loaded object.</param>
        /// <param name="fileToLoad">Path of the results file to read.</param>
        /// <param name="poolParameterDictionary">Receives the pool parameters under <paramref name="inputFile"/>.</param>
        /// <param name="peptides">Receives the loaded peptide records.</param>
        /// <param name="proteins">Receives the loaded protein records.</param>
        /// <param name="proteoforms">Receives the loaded proteoform records.</param>
        /// <returns>True when the file was read and parsed completely; false on any failure.</returns>
        public static bool LoadExistingResults(string inputFile, string fileToLoad, Dictionary <string, PoolParameters> poolParameterDictionary, ObservableCollection <PeptideTurnoverObject> peptides, List <PeptideTurnoverObject> proteins, List <PeptideTurnoverObject> proteoforms)
        {
            try
            {
                // Parses one ';'-separated column of numbers.
                Func<string, double[]> parseValues = field => field.Split(';').Select(x => Convert.ToDouble(x)).ToArray();

                // Parses a protein or proteoform record; both use the same column layout.
                // Column 6 is not read; inputFile is passed as the file name instead.
                Func<string[], PeptideTurnoverObject> parseProteinLevelRecord = columns =>
                {
                    PeptideTurnoverObject record = new PeptideTurnoverObject(
                        columns[0],
                        parseValues(columns[1]),
                        parseValues(columns[2]),
                        columns[3].Split(';'),
                        parseValues(columns[4]),
                        Convert.ToDouble(columns[5]),
                        inputFile, //file
                        columns[7]);
                    record.Kbi     = Convert.ToDouble(columns[8]);
                    record.LowKbi  = Convert.ToDouble(columns[9]);
                    record.HighKbi = Convert.ToDouble(columns[10]);
                    return record;
                };

                List<PeptideTurnoverObject> loadedPeptides    = new List<PeptideTurnoverObject>();
                List<PeptideTurnoverObject> loadedProteins    = new List<PeptideTurnoverObject>();
                List<PeptideTurnoverObject> loadedProteoforms = new List<PeptideTurnoverObject>();

                string[] lines = File.ReadAllLines(fileToLoad);

                //-pool parameters
                double[]       poolParams           = lines[0].Split('\t').Select(x => Convert.ToDouble(x)).ToArray();
                PoolParameters loadedPoolParameters = new PoolParameters(poolParams[0], poolParams[1], poolParams[2]);

                //-peptides
                int i = 1;
                for (; i < lines.Length; i++)
                {
                    string[] line = lines[i].Split('\t');
                    if (line.Length == 1)
                    {
                        // Separator line: step past it and continue with the next section.
                        i++;
                        break;
                    }
                    PeptideTurnoverObject peptide = new PeptideTurnoverObject(
                        line[0],
                        parseValues(line[1]),
                        parseValues(line[2]),
                        line[3].Split(';'),
                        parseValues(line[4]),
                        Convert.ToDouble(line[5]),
                        inputFile, //file
                        line[7],
                        line[8]);
                    peptide.Kbi     = Convert.ToDouble(line[9]);
                    peptide.Error   = Convert.ToDouble(line[10]);
                    peptide.LowKbi  = Convert.ToDouble(line[11]);
                    peptide.HighKbi = Convert.ToDouble(line[12]);
                    loadedPeptides.Add(peptide);
                }

                //-Proteins
                for (; i < lines.Length; i++)
                {
                    string[] line = lines[i].Split('\t');
                    if (line.Length == 1)
                    {
                        i++;
                        break;
                    }
                    loadedProteins.Add(parseProteinLevelRecord(line));
                }

                //-Proteoforms (last section: runs to the end of the file)
                for (; i < lines.Length; i++)
                {
                    loadedProteoforms.Add(parseProteinLevelRecord(lines[i].Split('\t')));
                }

                // Everything parsed: publish the results to the caller's collections.
                poolParameterDictionary[inputFile] = loadedPoolParameters;
                foreach (PeptideTurnoverObject peptide in loadedPeptides)
                {
                    peptides.Add(peptide);
                }
                proteins.AddRange(loadedProteins);
                proteoforms.AddRange(loadedProteoforms);
                return(true);
            }
            catch
            {
                // Deliberate best effort: any failure (missing file, malformed line, bad number) is
                // reported to the caller as "could not load".
                return(false);
            }
        }
Пример #6
0
        public static List <PeptideTurnoverObject> ReadData(string file, Settings settings, List <Protein> theoreticalProteins)
        {
            List <string> columnIndexToSampleName = new List <string>();
            List <double> columnIndexToTimepoint  = new List <double>();
            Dictionary <string, double>         sampleToTimepoint  = new Dictionary <string, double>();
            Dictionary <double, List <string> > timepointToSamples = new Dictionary <double, List <string> >();

            List <PeptideTurnoverObject> peptides = new List <PeptideTurnoverObject>();

            if (settings.UpstreamProgram == Settings.SearchEngine.MetaMorpheus)
            {
                int      firstIntensityIndex = -1;
                string[] lines  = File.ReadAllLines(file);
                string[] header = lines[0].Split('\t');
                //Get the header information for which columns are which samples and which timepoints
                for (int i = 0; i < header.Length; i += 2)
                {
                    if (!header[i].Contains("Intensity_"))
                    {
                        i++;
                        while (i < header.Length && !header[i].Contains("Intensity_"))
                        {
                            i++;
                        }
                        if (firstIntensityIndex == -1)
                        {
                            firstIntensityIndex = i;
                        }
                        i -= 2;
                    }
                    else
                    {
                        string   columnHeader      = header[i].Replace('-', '_');
                        string[] columnHeaderSplit = columnHeader.Split('_');
                        string   sampleName        = columnHeader.Substring(0, columnHeader.Length - columnHeaderSplit[columnHeaderSplit.Length - 1].Length - 1);
                        columnIndexToSampleName.Add(sampleName);
                        for (int j = 0; j < columnHeaderSplit.Length; j++)
                        {
                            string timepointString = columnHeaderSplit[j];
                            double time            = -1;
                            if (timepointString[0] == 'd' || timepointString[0] == 'D')
                            {
                                try
                                {
                                    time = Convert.ToDouble(timepointString.Substring(1));
                                }
                                catch { };
                            }
                            else if (timepointString[timepointString.Length - 1] == 'd' || timepointString[timepointString.Length - 1] == 'D')
                            {
                                try
                                {
                                    time = Convert.ToDouble(timepointString.Substring(0, timepointString.Length - 1));
                                }
                                catch { };
                            }
                            if (time != -1)
                            {
                                if (timepointToSamples.ContainsKey(time))
                                {
                                    timepointToSamples[time].Add(sampleName);
                                }
                                else
                                {
                                    timepointToSamples[time] = new List <string> {
                                        sampleName
                                    };
                                }
                                sampleToTimepoint[sampleName] = time;
                                columnIndexToTimepoint.Add(time);
                                break;
                            }
                        }
                    }
                }

                //Diagnostics
                //int numOriginalPeps = 0;
                //int numSurvivingPeps = 0;
                ////List<string> originalProteins = new List<string>();
                ////List<string> survivingProteins = new List<string>();
                //int[] originalDistribution = new int[25];
                //int[] postBadRemovalDistribution = new int[25];
                //int[] postRequirementPerTimepoint = new int[25];
                //HashSet<string> originalProteins = new HashSet<string>();
                //HashSet<string> survivingProteins = new HashSet<string>();

                //10 investigate
                //int[] num3 = new int[6];
                //int[] num7 = new int[6];
                //int[] num14 = new int[6];
                //int[] num30 = new int[6];
                //int[] num60 = new int[6];
                //List<(double, string)> peptidesWith10Values = new List<(double, string)>();


                //read in the intensities
                bool peptideInput = header[0].Equals("Sequence");
                for (int i = 1; i < lines.Length; i++)
                {
                    string[]      line     = lines[i].Split('\t');
                    string        sequence = line[0];
                    string        protein  = peptideInput ? line[2] : line[0];
                    List <double> timepointsForThisPeptide  = new List <double>();
                    List <double> rfValuesForThisPeptide    = new List <double>();
                    List <string> filenamesForThisPeptide   = new List <string>();
                    List <double> intensitiesForThisPeptide = new List <double>();

                    //Diagnostics
                    //numOriginalPeps++;
                    //originalProteins.Add(protein);
                    int    numOriginal       = 0;
                    int    numPostBadRemoval = 0;
                    int    numPostRequirementPerTimepoint = 0;
                    double highestIntensity = 0;
                    double averageIntensity = 0;

                    for (int column = firstIntensityIndex; column < firstIntensityIndex + columnIndexToSampleName.Count * 2; column += 2)
                    {
                        double originalIntensity         = line[column].Length == 0 ? 0 : Convert.ToDouble(line[column]);
                        double newlySynthesizedIntensity = line[column + 1].Length == 0 ? 0 : Convert.ToDouble(line[column + 1]);
                        bool   atLeastOneIntensity       = originalIntensity != 0 || newlySynthesizedIntensity != 0;
                        if (atLeastOneIntensity)
                        {
                            numOriginal++;
                            averageIntensity += originalIntensity + newlySynthesizedIntensity;
                            if (originalIntensity + newlySynthesizedIntensity > highestIntensity)
                            {
                                highestIntensity = originalIntensity + newlySynthesizedIntensity;
                            }
                        }
                        if ((settings.UseBadRatios && atLeastOneIntensity) || (originalIntensity != 0 && newlySynthesizedIntensity != 0))
                        {
                            numPostBadRemoval++;
                            int indexLookup = (column - firstIntensityIndex) / 2;
                            timepointsForThisPeptide.Add(columnIndexToTimepoint[indexLookup]);
                            rfValuesForThisPeptide.Add(newlySynthesizedIntensity / (originalIntensity + newlySynthesizedIntensity));
                            filenamesForThisPeptide.Add(columnIndexToSampleName[indexLookup]);
                            intensitiesForThisPeptide.Add(originalIntensity + newlySynthesizedIntensity);
                        }
                    }
                    //if (numOriginal == 10)
                    //{
                    //    num3[timepointsForThisPeptide.Count(x => x > 2 && x < 4)]++;
                    //    num7[timepointsForThisPeptide.Count(x => x > 6 && x < 8)]++;
                    //    num14[timepointsForThisPeptide.Count(x => x > 13 && x < 15)]++;
                    //    num30[timepointsForThisPeptide.Count(x => x > 29 && x < 31)]++;
                    //    num60[timepointsForThisPeptide.Count(x => x > 59 && x < 61)]++;
                    //    peptidesWith10Values.Add((intensitiesForThisPeptide.Median(), sequence));
                    //}
                    //remove timepoints with too little data
                    foreach (double timepoint in timepointToSamples.Keys)
                    {
                        List <int> indicesForThisTimepoint = new List <int>();
                        for (int index = 0; index < timepointsForThisPeptide.Count; index++)
                        {
                            if (timepointsForThisPeptide[index].Equals(timepoint))
                            {
                                numPostRequirementPerTimepoint++;
                                indicesForThisTimepoint.Add(index);
                            }
                        }
                        if (indicesForThisTimepoint.Count < settings.MinValidValuesPerTimepoint)
                        {
                            for (int index = indicesForThisTimepoint.Count - 1; index >= 0; index--)
                            {
                                numPostRequirementPerTimepoint--;
                                int actualIndex = indicesForThisTimepoint[index];
                                timepointsForThisPeptide.RemoveAt(actualIndex);
                                rfValuesForThisPeptide.RemoveAt(actualIndex);
                                filenamesForThisPeptide.RemoveAt(actualIndex);
                                intensitiesForThisPeptide.RemoveAt(actualIndex);
                            }
                        }
                    }

                    if (timepointsForThisPeptide.Count >= settings.MinValidValuesTotal)
                    {
                        //    numSurvivingPeps++;
                        //    survivingProteins.Add(protein);
                        peptides.Add(new PeptideTurnoverObject(sequence, timepointsForThisPeptide.ToArray(), rfValuesForThisPeptide.ToArray(),
                                                               filenamesForThisPeptide.ToArray(), intensitiesForThisPeptide.ToArray(), intensitiesForThisPeptide.Sum(), file, protein));
                    }
                    //originalDistribution[numOriginal]++;

                    //postBadRemovalDistribution[numPostBadRemoval]++;
                    //postRequirementPerTimepoint[numPostRequirementPerTimepoint]++;
                }
                string pathToWrite = file.Substring(0, file.Length - 4) + "_Results";
                Directory.CreateDirectory(pathToWrite);

                //DIAGNOSTICS
                //List<string> linesToWrite = new List<string>();
                //linesToWrite.Add("Original Peptides:\t" + numOriginalPeps.ToString());
                //linesToWrite.Add("Surviving Peptides:\t" + numSurvivingPeps.ToString());
                //linesToWrite.Add("Original Proteins:\t" + originalProteins.Count.ToString());
                //linesToWrite.Add("Surviving Proteins:\t" + survivingProteins.Count.ToString());
                //linesToWrite.Add("");
                //linesToWrite.Add("NumPeptides\tOriginal\tPostBadRatios\tPostTimepointMin");
                //for (int i = 0; i < 25; i++)
                //{
                //    linesToWrite.Add(i.ToString() + '\t' + originalDistribution[i].ToString() + '\t' + postBadRemovalDistribution[i].ToString() + '\t' + postRequirementPerTimepoint[i].ToString());
                //}
                //string filename = Path.GetFileNameWithoutExtension(file);

                //pathToWrite = Path.Combine(pathToWrite, filename);
                //File.WriteAllLines(pathToWrite + "_Diagnostics.tsv", linesToWrite);

                //linesToWrite.Clear();
                //linesToWrite.Add("\t3\t7\t14\t30\t60");
                //for(int i=0; i<6; i++)
                //{
                //    linesToWrite.Add(i.ToString() + '\t' + num3[i].ToString() + '\t' + num7[i].ToString() + '\t' + num14[i].ToString() + '\t' + num30[i].ToString() + '\t' + num60[i].ToString());
                //}
                //peptidesWith10Values = peptidesWith10Values.OrderByDescending(x => x.Item1).ToList();
                //linesToWrite.Add("");
                //linesToWrite.Add(peptidesWith10Values[0].Item2);
                //linesToWrite.Add(peptidesWith10Values[1].Item2);
                //linesToWrite.Add(peptidesWith10Values[2].Item2);
                //linesToWrite.Add(peptidesWith10Values[3].Item2);
                //linesToWrite.Add(peptidesWith10Values[4].Item2);
                //File.WriteAllLines(pathToWrite + "_DiagnosticsFor10ValidFiles.tsv", linesToWrite);
                //int a = 0;
            }
            else //if maxquant (not really maxquant, but the same format used in Alevra, M.; Mandad, S.; Ischebeck, T.; Urlaub, H.; Rizzoli, S. O.; Fornasiero, E. F. A Mass Spectrometry Workflow for Measuring Protein Turnover Rates in Vivo. Nature Protocols 2019. https://doi.org/10.1038/s41596-019-0222-y.)
            {
                int      firstRatioIndex     = -1;
                int      firstIntensityIndex = -1;
                string[] lines  = File.ReadAllLines(file);
                string[] header = lines[0].Split('\t');

                bool peptideInput = header[0].Equals("Sequence");

                //Get the header information for which columns are which samples and which timepoints
                if (peptideInput)
                {
                    for (int i = 0; i < header.Length; i += 6) //6 is the spacing for the actual, normalized, variablitiy, count, iso, type
                    {
                        if (!header[i].Contains("Ratio H/L"))
                        {
                            i++;
                            while (i < header.Length && !header[i].Contains("Ratio H/L"))
                            {
                                if (header[i].Contains("Intensity")) //intensities are after the ratios. Reason unknown, they do not match the ratios
                                {
                                    firstIntensityIndex = i + 3;
                                    i = header.Length; //end
                                }
                                i++;
                            }
                            if (firstRatioIndex == -1)
                            {
                                firstRatioIndex = i + 6;
                            }
                            //skip the first one because it's the aggregate
                        }
                        else
                        {
                            string   sampleName        = header[i].Replace('-', '_').Replace(' ', '_').Substring("Ratio H/L ".Length);
                            string[] columnHeaderSplit = sampleName.Split('_');
                            columnIndexToSampleName.Add(sampleName);

                            double time = Convert.ToDouble(columnHeaderSplit[0]);
                            if (timepointToSamples.ContainsKey(time))
                            {
                                timepointToSamples[time].Add(sampleName);
                            }
                            else
                            {
                                timepointToSamples[time] = new List <string> {
                                    sampleName
                                };
                            }
                            sampleToTimepoint[sampleName] = time;
                            columnIndexToTimepoint.Add(time);
                        }
                    }

                    //read in the intensities
                    for (int i = 1; i < lines.Length; i++)
                    {
                        string[]      line     = lines[i].Split('\t');
                        string        sequence = line[0];
                        string        protein  = line[34];
                        List <double> timepointsForThisPeptide  = new List <double>();
                        List <double> rfValuesForThisPeptide    = new List <double>();
                        List <string> filenamesForThisPeptide   = new List <string>();
                        List <double> intensitiesForThisPeptide = new List <double>();
                        for (int column = firstRatioIndex; column < firstRatioIndex + columnIndexToSampleName.Count * 6; column += 6)
                        {
                            if (!line[column].Equals("NaN"))
                            {
                                int indexLookup = (column - firstRatioIndex) / 6;
                                timepointsForThisPeptide.Add(columnIndexToTimepoint[indexLookup]);

                                double ratio = Convert.ToDouble(line[column]);
                                rfValuesForThisPeptide.Add(1 - ratio / (ratio + 1)); //convert H/L to L/Total //TODO remove the "1-" for normal experiments before release
                                filenamesForThisPeptide.Add(columnIndexToSampleName[indexLookup]);
                                intensitiesForThisPeptide.Add(Convert.ToDouble(line[(column - firstRatioIndex) / 6 + firstIntensityIndex]));
                            }
                        }
                        //remove timepoints with too little data
                        foreach (double timepoint in timepointToSamples.Keys)
                        {
                            List <int> indicesForThisTimepoint = new List <int>();
                            for (int index = 0; index < timepointsForThisPeptide.Count; index++)
                            {
                                if (timepointsForThisPeptide[index].Equals(timepoint))
                                {
                                    indicesForThisTimepoint.Add(index);
                                }
                            }
                            if (indicesForThisTimepoint.Count < settings.MinValidValuesPerTimepoint)
                            {
                                for (int index = indicesForThisTimepoint.Count - 1; index >= 0; index--)
                                {
                                    int actualIndex = indicesForThisTimepoint[index];
                                    timepointsForThisPeptide.RemoveAt(actualIndex);
                                    rfValuesForThisPeptide.RemoveAt(actualIndex);
                                    filenamesForThisPeptide.RemoveAt(actualIndex);
                                    intensitiesForThisPeptide.RemoveAt(actualIndex);
                                }
                            }
                        }

                        if (timepointsForThisPeptide.Count >= settings.MinValidValuesTotal)
                        {
                            peptides.Add(new PeptideTurnoverObject(sequence, timepointsForThisPeptide.ToArray(), rfValuesForThisPeptide.ToArray(),
                                                                   filenamesForThisPeptide.ToArray(), intensitiesForThisPeptide.ToArray(), intensitiesForThisPeptide.Sum(), file, protein));
                        }
                    }
                }
                else //if protein input
                {
                    for (int i = 0; i < header.Length; i++) //no spacing here
                    {
                        if (!header[i].Contains("Ratio H/L"))
                        {
                            i++;
                            while (i < header.Length && !header[i].Contains("Ratio H/L"))
                            {
                                if (header[i].Contains("Intensity")) //intensities are after the ratios. Reason unknown, they do not match the ratios
                                {
                                    firstIntensityIndex = i + 3;
                                    i = header.Length; //end
                                }
                                i++;
                            }
                            if (firstRatioIndex == -1)
                            {
                                firstRatioIndex = i;
                            }
                            //skip the first one because it's the aggregate
                        }
                        else
                        {
                            if (header[i].Contains("normalized"))
                            {
                                while (i < header.Length)
                                {
                                    if (header[i].Contains("Intensity")) //intensities are after the ratios. Reason unknown, they do not match the ratios
                                    {
                                        firstIntensityIndex = i + 3;
                                        i = header.Length; //end
                                    }
                                    i++;
                                }
                                break;
                            }
                            string   sampleName        = header[i].Replace('-', '_').Replace(' ', '_').Substring("Ratio H/L ".Length);
                            string[] columnHeaderSplit = sampleName.Split('_');
                            columnIndexToSampleName.Add(sampleName);

                            double time = Convert.ToDouble(columnHeaderSplit[0]);
                            if (timepointToSamples.ContainsKey(time))
                            {
                                timepointToSamples[time].Add(sampleName);
                            }
                            else
                            {
                                timepointToSamples[time] = new List <string> {
                                    sampleName
                                };
                            }
                            sampleToTimepoint[sampleName] = time;
                            columnIndexToTimepoint.Add(time);
                        }
                    }

                    //read in the intensities
                    for (int i = 1; i < lines.Length; i++)
                    {
                        string[]      line     = lines[i].Split('\t');
                        string        sequence = line[0];
                        string        protein  = line[0];
                        List <double> timepointsForThisPeptide  = new List <double>();
                        List <double> rfValuesForThisPeptide    = new List <double>();
                        List <string> filenamesForThisPeptide   = new List <string>();
                        List <double> intensitiesForThisPeptide = new List <double>();
                        for (int column = firstRatioIndex; column < firstRatioIndex + columnIndexToSampleName.Count; column++)
                        {
                            if (!line[column].Equals("NaN"))
                            {
                                int indexLookup = (column - firstRatioIndex);
                                timepointsForThisPeptide.Add(columnIndexToTimepoint[indexLookup]);

                                double ratio = Convert.ToDouble(line[column]);
                                rfValuesForThisPeptide.Add(1 - ratio / (ratio + 1)); //convert H/L to L/Total //TODO remove the "1-" for normal experiments before release
                                filenamesForThisPeptide.Add(columnIndexToSampleName[indexLookup]);
                                intensitiesForThisPeptide.Add(Convert.ToDouble(line[(column - firstRatioIndex) + firstIntensityIndex]));
                            }
                        }
                        //remove timepoints with too little data
                        foreach (double timepoint in timepointToSamples.Keys)
                        {
                            List <int> indicesForThisTimepoint = new List <int>();
                            for (int index = 0; index < timepointsForThisPeptide.Count; index++)
                            {
                                if (timepointsForThisPeptide[index].Equals(timepoint))
                                {
                                    indicesForThisTimepoint.Add(index);
                                }
                            }
                            if (indicesForThisTimepoint.Count < settings.MinValidValuesPerTimepoint)
                            {
                                for (int index = indicesForThisTimepoint.Count - 1; index >= 0; index--)
                                {
                                    int actualIndex = indicesForThisTimepoint[index];
                                    timepointsForThisPeptide.RemoveAt(actualIndex);
                                    rfValuesForThisPeptide.RemoveAt(actualIndex);
                                    filenamesForThisPeptide.RemoveAt(actualIndex);
                                    intensitiesForThisPeptide.RemoveAt(actualIndex);
                                }
                            }
                        }

                        if (timepointsForThisPeptide.Count >= settings.MinValidValuesTotal)
                        {
                            peptides.Add(new PeptideTurnoverObject(sequence, timepointsForThisPeptide.ToArray(), rfValuesForThisPeptide.ToArray(),
                                                                   filenamesForThisPeptide.ToArray(), intensitiesForThisPeptide.ToArray(), intensitiesForThisPeptide.Sum(), file, protein));
                        }
                    }
                }
            }

            //create a hash table for quick lookups
            //the idea is to break each protein into k-mers (k must not exceed the shortest peptide length, because the first k residues of every peptide are used as the lookup key) and record every start index at which each k-mer occurs

            Dictionary <Protein, Dictionary <string, List <int> > > theoreticalProteinLookupTable = new Dictionary <Protein, Dictionary <string, List <int> > >();
            const int kMerLength = 6;

            foreach (Protein p in theoreticalProteins)
            {
                string baseSequence = p.BaseSequence;
                Dictionary <string, List <int> > lookupTable = new Dictionary <string, List <int> >();
                for (int i = 0; i < baseSequence.Length - kMerLength + 1; i++)
                {
                    string kmer = baseSequence.Substring(i, kMerLength);
                    if (lookupTable.TryGetValue(kmer, out var value))
                    {
                        value.Add(i);
                    }
                    else
                    {
                        lookupTable.Add(kmer, new List <int> {
                            i
                        });
                    }
                }
                if (!theoreticalProteinLookupTable.Keys.Contains(p))
                {
                    theoreticalProteinLookupTable.Add(p, lookupTable);
                }
            }

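            //look up each peptide's sequence in the database
            //first drop peptides with no relative fractions and sort by base sequence, so peptides sharing a base sequence are adjacent and the lookup result can be reused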
            peptides = peptides.Where(x => x.RelativeFractions.Length != 0).OrderBy(x => x.BaseSequence).ToList();

            //do reverse parsimony
            int[] threads = Enumerable.Range(0, Environment.ProcessorCount).ToArray();
            //int[] threads = Enumerable.Range(0, 1).ToArray();
            Parallel.ForEach(threads, (thread) =>
            {
                string mostRecentBaseSequence = "";
                int mostRecentIndex           = -1;
                int max = (thread + 1) * peptides.Count / threads.Length;
                for (int i = thread * peptides.Count / threads.Length; i < max; i++)
                {
                    PeptideTurnoverObject currentPeptide = peptides[i];
                    //if same base seq, just reuse the old info
                    if (!currentPeptide.BaseSequence.Equals(mostRecentBaseSequence))
                    {
                        mostRecentBaseSequence = currentPeptide.BaseSequence;

                        //find proteins containing this sequence
                        //List<Protein> proteinsContainingThisSeq = theoreticalProteins.Where(x => x.BaseSequence.Contains(mostRecentBaseSequence)).OrderBy(x => x.Accession).ToList();
                        List <Protein> proteinsContainingThisSeq = new List <Protein>();

                        string kMer = mostRecentBaseSequence.Substring(0, kMerLength);
                        foreach (Protein p in theoreticalProteins)
                        {
                            if (theoreticalProteinLookupTable[p].TryGetValue(kMer, out var indices))
                            {
                                if (NativeStringSearch(p.BaseSequence, mostRecentBaseSequence, indices, kMer))
                                {
                                    proteinsContainingThisSeq.Add(p);
                                }
                            }
                        }
                        if (proteinsContainingThisSeq.Count == 0)
                        {
                            throw new Exception("Database was missing the protein: " + currentPeptide.Protein + " or the given protein did not contain the sequence: " + mostRecentBaseSequence);
                        }
                        string protein = proteinsContainingThisSeq[0].Accession;
                        for (int index = 1; index < proteinsContainingThisSeq.Count; index++)
                        {
                            protein += ";" + proteinsContainingThisSeq[index].Accession;
                        }
                        if (!protein.Equals(currentPeptide.Protein))
                        { //why aren't they the same? Parsimony
                            currentPeptide.UpdateProteinFromParsimony(protein);
                        }

                        if (proteinsContainingThisSeq.Count > 1)
                        {
                        }   //TODO: The handling of this is sloppy, but complicated to deal with the right way.
                            //It's assuming that proteoforms will only appear by being on the same peptide (modified/unmodified) and not through overlapping peptides
                            //find index of base sequence
                        mostRecentIndex = proteinsContainingThisSeq.First().BaseSequence.IndexOf(mostRecentBaseSequence);
                    }
                    currentPeptide.StartResidue = mostRecentIndex;
                    currentPeptide.EndResidue   = mostRecentIndex + mostRecentBaseSequence.Length;
                    //if there are mods
                    if (mostRecentBaseSequence.Length != currentPeptide.FullSequence.Length)
                    {
                        string fullSequence = currentPeptide.FullSequence;
                        int currentIndex    = 0;
                        for (int index = 0; index < fullSequence.Length; index++)
                        {
                            //if there's a mod
                            if (fullSequence[index] == '[')
                            {
                                int bracketCount = 1;
                                string mod       = "";
                                index++;
                                while (bracketCount != 0)
                                {
                                    if (fullSequence[index] == '[')
                                    {
                                        bracketCount++;
                                    }
                                    else if (fullSequence[index] == ']')
                                    {
                                        bracketCount--;
                                        index--;
                                    }
                                    if (bracketCount != 0)
                                    {
                                        mod += fullSequence[index];
                                    }
                                    index++;
                                }
                                if (currentIndex == 1 && currentPeptide.ModDictionary.ContainsKey(0))
                                {
                                    currentPeptide.ModDictionary[0] = currentPeptide.ModDictionary[0] + "+" + mod;
                                }
                                else
                                {
                                    currentPeptide.ModDictionary.Add(currentIndex == 0 ? 0 : currentIndex - 1, mod); //N-terminal mods are counted as being on the first residue
                                }
                            }
                            else
                            {
                                currentIndex++;
                            }
                        }
                    }
                }
            });

            //have all peptides, now convert into proteins
            List <PeptideTurnoverObject> possibleProteoformGroups = new List <PeptideTurnoverObject>();
            var peptidesGroupedByProtein = peptides.GroupBy(x => x.Protein, x => x).ToList();

            //TODO parallelize this, break out into separate method, unit tests
            //foreach protein group
            for (int i = 0; i < peptidesGroupedByProtein.Count; i++)
            {
                var    group   = peptidesGroupedByProtein[i];
                string protein = group.Key;
                //get the peptides
                List <PeptideTurnoverObject>     peptidesForThisProtein = group.OrderBy(x => x.StartResidue).ToList();
                Dictionary <int, List <string> > mods = new Dictionary <int, List <string> >();
                foreach (PeptideTurnoverObject peptide in peptidesForThisProtein)
                {
                    for (int residue = peptide.StartResidue; residue < peptide.EndResidue; residue++)
                    {
                        //is residue start/end correct?
                        string value = UnmodifiedString;
                        if (peptide.ModDictionary.ContainsKey(residue - peptide.StartResidue)) //the mod dictionary is zero-based.
                        {
                            value = peptide.ModDictionary[residue - peptide.StartResidue];
                        }
                        if (mods.ContainsKey(residue))
                        {
                            if (!mods[residue].Contains(value))
                            {
                                mods[residue].Add(value);
                            }
                        }
                        else
                        {
                            mods.Add(residue, new List <string> {
                                value
                            });
                        }
                    }
                }

                //find which residues have multiple forms
                var    residueDifferences = mods.Where(x => x.Value.Count > 1).OrderBy(x => x.Key).ToList();
                bool[] uniquePeptides     = new bool[peptidesForThisProtein.Count]; //have we added this peptide already? At the start, none have been added.
                if (residueDifferences.Count != 0)
                {
                    //foreach residue with a different form
                    foreach (var residueDifference in residueDifferences)
                    {
                        int residueIndex = residueDifference.Key; //get the residue

                        //find the relevant peptides for this residue
                        bool foundResidue = false;
                        for (int index = 0; index < peptidesForThisProtein.Count; index++)
                        {
                            PeptideTurnoverObject peptide = peptidesForThisProtein[index];
                            //if this peptide is relevant as belonging to a proteoform group
                            if (peptide.StartResidue <= residueIndex && peptide.EndResidue >= residueIndex)
                            {//is this right?
                                uniquePeptides[index] = true;
                                foundResidue          = true;
                                string modForThisPeptide = UnmodifiedString;
                                if (peptide.ModDictionary.ContainsKey(residueIndex - peptide.StartResidue))
                                {
                                    modForThisPeptide = peptide.ModDictionary[residueIndex - peptide.StartResidue];
                                }
                                for (int indexForThisMod = 0; indexForThisMod < residueDifference.Value.Count; indexForThisMod++)
                                {
                                    if (residueDifference.Value[indexForThisMod].Equals(modForThisPeptide))
                                    {
                                        possibleProteoformGroups.Add(peptide.Copy(peptide.Protein + "_" + modForThisPeptide + "@" + residueDifference.Key.ToString()));
                                        break;
                                    }
                                }
                            }
                            //if we're no longer looking at relevant peptides
                            else if (foundResidue)
                            {
                                break;
                            }
                        }
                    }
                }

                //add peptides that aren't part of proteoform groups
                for (int index = 0; index < peptidesForThisProtein.Count; index++)
                {
                    if (!uniquePeptides[index])
                    {
                        possibleProteoformGroups.Add(peptidesForThisProtein[index]);
                    }
                }
            }

            return(possibleProteoformGroups); //.Where(x => x.Timepoints.Length >= settings.MinValidValuesTotal).OrderByDescending(x => x.Timepoints.Length).ThenByDescending(x => x.TotalIntensity).ToList();
        }
Пример #7
0
        /// <summary>
        /// Redraws the comparison plots for the supplied peptides. For each peptide the observed ratios are
        /// added to <c>RatioComparisonPlot</c>, and every displayed peptide that shares its protein/proteoform
        /// and file is added to <c>HalfLifeComparisonPlot</c> as error (x) versus half-life (y) with error bars.
        /// Fit, confidence-interval and amino-acid-pool curves are added through <c>PlotFit</c> when the
        /// corresponding check boxes are ticked.
        /// </summary>
        /// <remarks>
        /// If no analyzed protein/proteoform matches a peptide, a message box is shown and the method returns
        /// immediately without rendering.
        /// </remarks>
        /// <param name="peptidesToPlot">Peptides to draw; each one contributes its own series.</param>
        private void PlotPeptideData(List <PeptideTurnoverObject> peptidesToPlot)
        {
            RatioComparisonPlot.plt.Clear();
            RatioComparisonPlot.plt.Legend(false);
            HalfLifeComparisonPlot.plt.GetPlottables().Clear();

            //half-life = ln(2) / kbi; evaluate ln(2) once instead of at every use
            double ln2 = Math.Log(2, Math.E);

            //running axis limits, widened by every peptide that is plotted
            double minError    = double.PositiveInfinity;
            double maxError    = double.NegativeInfinity;
            double minHalfLife = double.PositiveInfinity;
            double maxHalfLife = double.NegativeInfinity;

            foreach (PeptideTurnoverObject peptide in peptidesToPlot)
            {
                //get the title; the font shrinks for long sequences and is clamped to [12, 24].
                //The divisor is floored at 1 so an empty sequence cannot cause an integer divide-by-zero.
                int fontDivisor = Math.Max(1, (int)Math.Round(Math.Sqrt(peptide.DisplayPeptideSequence.Length)));
                int fontSize    = Math.Max(Math.Min(24, 100 / fontDivisor), 12);
                RatioComparisonPlot.plt.Title(peptide.DisplayPeptideSequence, fontSize: fontSize);

                string protein  = peptide.DisplayProteinOrProteoform;
                string filepath = peptide.FileName;
                string filename = Path.GetFileNameWithoutExtension(filepath);

                //half-life of the selected peptide and of its confidence bounds
                //(a low kbi gives the upper half-life bound, a high kbi the lower one)
                double peptideHalfLife      = ln2 / peptide.Kbi;
                double peptideUpperHalfLife = ln2 / peptide.LowKbi;
                double peptideLowerHalfLife = ln2 / peptide.HighKbi;

                //plot actual data
                RatioComparisonPlot.plt.PlotScatter(peptide.Timepoints, peptide.RelativeFractions, markerSize: 4, lineWidth: 0, label: filename + " Observed Ratios");

                //Plot protein info: every displayed peptide from the same protein/proteoform and file
                List<PeptideTurnoverObject> peptidesSharingProteinAndFile = PeptidesToDisplay.Where(x => x.DisplayProteinOrProteoform.Equals(protein) && x.FileName.Equals(filepath)).ToList();
                double[] errors         = peptidesSharingProteinAndFile.Select(x => x.Error).ToArray();
                double[] halfLives      = peptidesSharingProteinAndFile.Select(x => ln2 / x.Kbi).ToArray();
                double[] negativeErrors = peptidesSharingProteinAndFile.Select(x => (ln2 / x.Kbi) - (ln2 / x.HighKbi)).ToArray();
                double[] positiveErrors = peptidesSharingProteinAndFile.Select(x => (ln2 / x.LowKbi) - (ln2 / x.Kbi)).ToArray();

                HalfLifeComparisonPlot.plt.Title(protein, fontSize: 24);
                HalfLifeComparisonPlot.plt.Layout(titleHeight: 20, xLabelHeight: 40, y2LabelWidth: 20);
                HalfLifeComparisonPlot.plt.YLabel("Half-life (Days)", fontSize: 20);
                HalfLifeComparisonPlot.plt.XLabel("Error (MSE)", fontSize: 20);
                HalfLifeComparisonPlot.plt.Ticks(fontSize: 18);

                //data ranges; a zero range (single point or identical values) falls back to a small fixed padding basis
                double lowestError     = errors.Min();
                double highestError    = errors.Max();
                double lowestHalfLife  = halfLives.Min();
                double highestHalfLife = halfLives.Max();

                double errorDiff = highestError - lowestError;
                if (errorDiff == 0)
                {
                    errorDiff = 0.01;
                }
                double halflifeDiff = highestHalfLife - lowestHalfLife;
                if (halflifeDiff == 0)
                {
                    halflifeDiff = 0.01;
                }

                var scatter = HalfLifeComparisonPlot.plt.PlotScatter(errors, halfLives, lineWidth: 0, label: filename + " peptides", color: Color.SteelBlue);
                //plot the single point of the selected peptide separately (overlay) so that we know which one it is
                var point = HalfLifeComparisonPlot.plt.PlotPoint(peptide.Error, peptideHalfLife, color: Color.Black);
                //plot errors
                HalfLifeComparisonPlot.plt.PlotErrorBars(errors, halfLives, null, null, positiveErrors, negativeErrors, scatter.color);
                HalfLifeComparisonPlot.plt.PlotErrorBars(new double[] { peptide.Error }, new double[] { peptideHalfLife },
                                                         null, null, new double[] { peptideUpperHalfLife - peptideHalfLife },
                                                         new double[] { peptideHalfLife - peptideLowerHalfLife }, color: point.color);

                //widen the shared axis limits by 20% of this peptide group's range (plus the largest error bar on y)
                minError    = Math.Min(minError, lowestError - errorDiff * 0.2);
                maxError    = Math.Max(maxError, highestError + errorDiff * 0.2);
                minHalfLife = Math.Min(minHalfLife, lowestHalfLife - negativeErrors.Max() - halflifeDiff * 0.2);
                maxHalfLife = Math.Max(maxHalfLife, highestHalfLife + positiveErrors.Max() + halflifeDiff * 0.2);

                double ySpacingFactor = (maxHalfLife - minHalfLife) * 0.2;
                HalfLifeComparisonPlot.plt.Axis(minError, maxError, minHalfLife - ySpacingFactor, maxHalfLife + ySpacingFactor);
                //NOTE(review): argument-less call kept from the original; it presumably only reads back the limits - confirm
                HalfLifeComparisonPlot.plt.Axis();

                //look up the protein (or proteoform) level result that this peptide belongs to
                PeptideTurnoverObject currentProtein = DisplayProteinInSpecificTable ?
                                                       AnalyzedProteins.FirstOrDefault(x => x.Protein.Equals(protein) && x.FileName.Equals(filepath)) :
                                                       AnalyzedProteoforms.FirstOrDefault(x => x.Proteoform.Equals(protein) && x.FileName.Equals(filepath));

                if (currentProtein == null)
                {
                    MessageBox.Show("Unable to find the protein for this peptide. There may be an issue with the loaded file.");
                    return;
                }

                //plot the fit
                if (PlotBestFitCheckBox.IsChecked.Value)
                {
                    //peptide level
                    PlotFit(PoolParameterDictionary[filepath], filename + " Fit (" + peptideHalfLife.ToString("F1") + " d)", peptide.Kbi);
                    //protein level
                    double halfLife = ln2 / currentProtein.Kbi;
                    HalfLifeComparisonPlot.plt.PlotHLine(halfLife, label: filename + " Half-life (" + halfLife.ToString("F1") + ")", color: Color.OrangeRed);
                }
                //plot the confidence intervals
                if (PlotCICheckBox.IsChecked.Value)
                {
                    //peptide level
                    PlotFit(PoolParameterDictionary[filepath], filename + " Upper CI (" + peptideUpperHalfLife.ToString("F1") + " d)", peptide.LowKbi);
                    PlotFit(PoolParameterDictionary[filepath], filename + " Lower CI (" + peptideLowerHalfLife.ToString("F1") + " d)", peptide.HighKbi);
                    //protein level
                    double upperHL = ln2 / currentProtein.LowKbi;
                    double lowerHL = ln2 / currentProtein.HighKbi;
                    HalfLifeComparisonPlot.plt.PlotHLine(upperHL, label: filename + " Upper CI (" + upperHL.ToString("F1") + ")", color: Color.Green);
                    HalfLifeComparisonPlot.plt.PlotHLine(lowerHL, label: filename + " Lower CI (" + lowerHL.ToString("F1") + ")", color: Color.Red);
                }
            }

            //free amino acid pool curves, one per displayed file
            if (PlotAminoAcidPoolCheckBox.IsChecked.Value)
            {
                foreach (string file in FilesToDisplayObservableCollection)
                {
                    PlotFit(PoolParameterDictionary[file], Path.GetFileNameWithoutExtension(file) + " Free Amino Acids");
                }
            }

            if (DisplayLegendCheckBox.IsChecked.Value)
            {
                HalfLifeComparisonPlot.plt.Legend();
            }
            else
            {
                HalfLifeComparisonPlot.plt.Legend(false);
            }
            HalfLifeComparisonPlot.Render();
        }
Пример #8
0
        /// <summary>
        /// Compares protein turnover between every pair of files and writes one tab-separated report per pair
        /// into a "StatisticalComparisons" folder located next to the first file in <paramref name="filenames"/>.
        /// </summary>
        /// <remarks>
        /// Only proteins present in both files of a pair are reported. For each such protein the observed
        /// relative fractions of both files are shifted onto one shared timepoint (the half-life implied by the
        /// average of the two kbi values) and the two sets of shifted ratios are compared with Student's t-test.
        /// Nothing is done when fewer than two files are supplied.
        /// </remarks>
        /// <param name="filenames">Files to compare pairwise; also used as keys into <paramref name="poolParameterDictionary"/>.</param>
        /// <param name="allProteins">Protein-level results for all files; matched to a file through their FileName.</param>
        /// <param name="poolParameterDictionary">Pool parameters (Kst, Kbt, Kao) for each file.</param>
        public static void CompareProteinsAcrossFiles(List <string> filenames, List <PeptideTurnoverObject> allProteins, Dictionary <string, PoolParameters> poolParameterDictionary)
        {
            if (filenames.Count < 2)
            {
                return;
            }

            string outputDirectory = Path.GetDirectoryName(filenames.First());
            Directory.CreateDirectory(Path.Combine(outputDirectory, "StatisticalComparisons"));

            for (int first = 0; first < filenames.Count; first++)
            {
                string         fileOne   = filenames[first];
                PoolParameters paramsOne = poolParameterDictionary[fileOne];
                //proteins of this file, sorted by name so the two lists can be walked in step below
                List<PeptideTurnoverObject> proteinsForFileOne = allProteins
                    .Where(x => fileOne.Equals(x.FileName))
                    .OrderBy(x => x.Protein)
                    .ToList();

                for (int second = first + 1; second < filenames.Count; second++)
                {
                    string         fileTwo   = filenames[second];
                    PoolParameters paramsTwo = poolParameterDictionary[fileTwo];
                    List<PeptideTurnoverObject> proteinsForFileTwo = allProteins
                        .Where(x => fileTwo.Equals(x.FileName))
                        .OrderBy(x => x.Protein)
                        .ToList();

                    //report lines, starting with the header
                    List<string> linesToWrite = new List<string>
                    {
                        "Protein\tFold Change\tNeg. log(p-Value)\tHalf-life " + fileOne + "\tHalf-life " + fileTwo
                    };

                    //walk both sorted lists together to find the proteins they have in common
                    int a = 0;
                    int b = 0;
                    while (a < proteinsForFileOne.Count && b < proteinsForFileTwo.Count)
                    {
                        PeptideTurnoverObject proteinOne = proteinsForFileOne[a];
                        PeptideTurnoverObject proteinTwo = proteinsForFileTwo[b];

                        int order = proteinOne.Protein.CompareTo(proteinTwo.Protein);
                        if (order < 0)
                        {
                            a++;
                            continue;
                        }
                        if (order > 0)
                        {
                            b++;
                            continue;
                        }

                        //Same protein in both files: t-test of the normalized ratios for all timepoints.
                        //(An earlier version ran the t-test on the Monte Carlo kbis, which dramatically overestimates the sample size.)
                        double averageKbi         = (proteinOne.Kbi + proteinTwo.Kbi) / 2;
                        double normalizedHalfLife = Math.Log(2) / (averageKbi); //this is the day we're going to normalize all of the relative fractions to

                        Sample     sampleOne = new Sample(NormalizeRatios(proteinOne, paramsOne, averageKbi, normalizedHalfLife));
                        Sample     sampleTwo = new Sample(NormalizeRatios(proteinTwo, paramsTwo, averageKbi, normalizedHalfLife));
                        TestResult result    = Sample.StudentTTest(sampleOne, sampleTwo);

                        double halfLifeOne    = Math.Log(2) / proteinOne.Kbi;
                        double halfLifeTwo    = Math.Log(2) / proteinTwo.Kbi;
                        double log2FoldChange = Math.Log2(halfLifeTwo) - Math.Log2(halfLifeOne);
                        double negLogPValue   = -1 * Math.Log10(result.Probability);

                        linesToWrite.Add(string.Join("\t",
                                                     proteinOne.Protein,
                                                     log2FoldChange.ToString(),
                                                     negLogPValue.ToString(),
                                                     halfLifeOne.ToString(),
                                                     halfLifeTwo.ToString()));

                        a++;
                        b++;
                    }

                    string reportName = "Comparison_" + Path.GetFileNameWithoutExtension(fileOne) + "vs" + Path.GetFileNameWithoutExtension(fileTwo) + ".tsv";
                    File.WriteAllLines(Path.Combine(outputDirectory, "StatisticalComparisons", reportName), linesToWrite);
                }
            }

            //Shifts every observed ratio of a protein onto the shared timepoint: the residual between the
            //observation and the comparison fit (at the observation's own timepoint) is kept, and the
            //comparison fit's value at the shared timepoint is added back.
            double[] NormalizeRatios(PeptideTurnoverObject protein, PoolParameters pool, double sharedKbi, double sharedTimepoint)
            {
                //a timepoint array of the same length holding only the shared timepoint
                double[] sharedTimepoints = Enumerable.Repeat(sharedTimepoint, protein.Timepoints.Length).ToArray();

                //expected ratios from the comparison fit at the observed timepoints and at the shared timepoint
                double[] expectedOriginalRatios = NonLinearRegression.PredictRelativeFractionUsingThreeCompartmentModel(pool.Kst, pool.Kbt, pool.Kao, sharedKbi, protein.Timepoints);
                double[] expectedUpdatedRatios  = NonLinearRegression.PredictRelativeFractionUsingThreeCompartmentModel(pool.Kst, pool.Kbt, pool.Kao, sharedKbi, sharedTimepoints);

                double[] normalizedRatios = new double[expectedOriginalRatios.Length];
                for (int index = 0; index < protein.RelativeFractions.Length; index++)
                {
                    normalizedRatios[index] = protein.RelativeFractions[index] - expectedOriginalRatios[index] + expectedUpdatedRatios[index];
                }
                return normalizedRatios;
            }
        }
Пример #9
0
        /// <summary>
        /// For each file, compares the turnover of proteoform pairs that belong to the same family and share
        /// the same site, and writes the results to
        /// "&lt;file&gt;_Results/&lt;file&gt;_ProteoformAnalysis.tsv" next to the file.
        /// </summary>
        /// <remarks>
        /// A family is the run of proteoforms (sorted by name) whose name has the same text before the first
        /// '_'. Only proteoforms whose name contains exactly one '@' are compared, and only with proteoforms
        /// of the same family whose text after the '@' is identical. Each pair is compared with Student's
        /// t-test on ratios normalized to a shared timepoint (the half-life implied by the average kbi).
        /// The p-value column holds -log10(p), matching CompareProteinsAcrossFiles, or "NA" when the test fails.
        /// </remarks>
        /// <param name="filenames">Files to analyze; also used as keys into <paramref name="poolParameterDictionary"/>.</param>
        /// <param name="allProteins">Proteoform-level results for all files; matched to a file through their FileName.</param>
        /// <param name="poolParameterDictionary">Pool parameters (Kst, Kbt, Kao) for each file.</param>
        public static void CompareProteoformsWithinFiles(List <string> filenames, List <PeptideTurnoverObject> allProteins, Dictionary <string, PoolParameters> poolParameterDictionary)
        {
            foreach (string filename in filenames)
            {
                PoolParameters poolParams = poolParameterDictionary[filename];
                List<PeptideTurnoverObject> proteinsForThisFile = allProteins.Where(x => filename.Equals(x.FileName)).OrderBy(x => x.Proteoform).ToList();
                List<string> linesToWrite = new List<string>
                {
                    "Proteoform A\tProteoform B\tHalf-life A\tHalf-life B\tLog2(Fold Change)\tNeg. log(p-Value)"
                };

                //walk the sorted list one proteoform family at a time: [familyStart, familyEnd)
                int familyStart = 0;
                while (familyStart < proteinsForThisFile.Count)
                {
                    string currentProtein = proteinsForThisFile[familyStart].Proteoform.Split('_')[0];

                    //find the first index that no longer belongs to this family
                    int familyEnd = familyStart + 1;
                    while (familyEnd < proteinsForThisFile.Count && currentProtein.Equals(proteinsForThisFile[familyEnd].Proteoform.Split('_')[0]))
                    {
                        familyEnd++;
                    }

                    for (int i = familyStart; i < familyEnd; i++)
                    {
                        PeptideTurnoverObject proteinOne = proteinsForThisFile[i];

                        //see if it has a localized mod (or localized unmodified site), otherwise skip
                        string[] proteoformOne = proteinOne.Proteoform.Split('@');
                        if (proteoformOne.Length != 2)
                        {
                            continue;
                        }

                        for (int j = i + 1; j < familyEnd; j++)
                        {
                            PeptideTurnoverObject proteinTwo = proteinsForThisFile[j];
                            string[] proteoformTwo           = proteinTwo.Proteoform.Split('@');

                            //only compare pairs that describe the same modification site
                            if (proteoformTwo.Length != 2 || !proteoformOne[1].Equals(proteoformTwo[1]))
                            {
                                continue;
                            }

                            //do the comparison (t-test of normalized ratios for all timepoints)
                            double averageKbi         = (proteinOne.Kbi + proteinTwo.Kbi) / 2;
                            double normalizedHalfLife = Math.Log(2) / (averageKbi); //this is the day we're going to normalize all of the relative fractions to

                            Sample sampleOne = new Sample(NormalizeRatios(proteinOne, poolParams, averageKbi, normalizedHalfLife));
                            Sample sampleTwo = new Sample(NormalizeRatios(proteinTwo, poolParams, averageKbi, normalizedHalfLife));

                            //the half-lives and their fold change do not depend on the t-test, so they are
                            //reported identically whether or not the p-value can be computed
                            double halfLifeOne    = Math.Log(2) / proteinOne.Kbi;
                            double halfLifeTwo    = Math.Log(2) / proteinTwo.Kbi;
                            double log2FoldChange = Math.Log2(halfLifeTwo) - Math.Log2(halfLifeOne);

                            string negLogPValue;
                            try //sometimes crashes if stdev is zero
                            {
                                TestResult result = Sample.StudentTTest(sampleOne, sampleTwo);
                                //base-10 log, consistent with the protein-level comparison report
                                negLogPValue = (-1 * Math.Log10(result.Probability)).ToString();
                            }
                            catch (Exception)
                            {
                                //deliberate best effort: keep the row and mark only the p-value as unavailable
                                negLogPValue = "NA";
                            }

                            linesToWrite.Add(string.Join("\t",
                                                         proteinOne.Proteoform,
                                                         proteinTwo.Proteoform,
                                                         halfLifeOne.ToString(),
                                                         halfLifeTwo.ToString(),
                                                         log2FoldChange.ToString(),
                                                         negLogPValue));
                        }
                    }

                    familyStart = familyEnd;
                }

                //make sure the results folder exists before writing (no-op when it is already there)
                string resultsDirectory = Path.Combine(Path.GetDirectoryName(filename), Path.GetFileNameWithoutExtension(filename) + "_Results");
                Directory.CreateDirectory(resultsDirectory);
                File.WriteAllLines(Path.Combine(resultsDirectory, Path.GetFileNameWithoutExtension(filename) + "_ProteoformAnalysis.tsv"), linesToWrite);
            }

            //Shifts every observed ratio of a proteoform onto the shared timepoint: the normalized ratio is the
            //original ratio minus the comparison fit at its own timepoint plus the comparison fit at the shared timepoint.
            double[] NormalizeRatios(PeptideTurnoverObject protein, PoolParameters pool, double sharedKbi, double sharedTimepoint)
            {
                //a timepoint array of the same length holding only the shared timepoint
                double[] sharedTimepoints = Enumerable.Repeat(sharedTimepoint, protein.Timepoints.Length).ToArray();

                double[] expectedOriginalRatios = NonLinearRegression.PredictRelativeFractionUsingThreeCompartmentModel(pool.Kst, pool.Kbt, pool.Kao, sharedKbi, protein.Timepoints);
                double[] expectedUpdatedRatios  = NonLinearRegression.PredictRelativeFractionUsingThreeCompartmentModel(pool.Kst, pool.Kbt, pool.Kao, sharedKbi, sharedTimepoints);

                double[] normalizedRatios = new double[expectedOriginalRatios.Length];
                for (int index = 0; index < protein.RelativeFractions.Length; index++)
                {
                    normalizedRatios[index] = protein.RelativeFractions[index] - expectedOriginalRatios[index] + expectedUpdatedRatios[index];
                }
                return normalizedRatios;
            }
        }