/// <summary>
/// Classifies every protein as "Significant" or "Not Significant" by comparing its N value against
/// <paramref name="nValueThreshold"/> and writes one comma-separated row per protein, preceded by a header row,
/// to <paramref name="printSignificantProteinsLocation"/>.
/// </summary>
/// <param name="allProteinInfo">Proteins to report; read at the same index as the three value lists.</param>
/// <param name="actualNValues">N value of each protein; its length drives the number of rows written.</param>
/// <param name="nValueThreshold">A protein whose N value is below this threshold is reported as "Not Significant".</param>
/// <param name="actualPValues">P value of each protein.</param>
/// <param name="actualLogFoldChange">Log fold change of each protein.</param>
/// <param name="printSignificantProteinsLocation">Path of the file to create or overwrite.</param>
private void PrintSignificantProtein(List<ProteinRowInfo> allProteinInfo, List<double> actualNValues, double nValueThreshold,
    List<double> actualPValues, List<double> actualLogFoldChange, string printSignificantProteinsLocation)
{
    // Numbers are formatted with the invariant culture: under a culture that uses a decimal comma the
    // default formatting would collide with the ", " column separator and corrupt the rows.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (StreamWriter writetext = new StreamWriter(printSignificantProteinsLocation))
    {
        // The header is written unconditionally so that an empty result set still yields a well-formed file.
        writetext.WriteLine("ProteinID, PValue, LogFoldChange, Significance");

        for (int i = 0; i < actualNValues.Count; i++)
        {
            string significance = actualNValues[i] < nValueThreshold ? "Not Significant" : "Significant";

            writetext.WriteLine(allProteinInfo[i].ProteinID + ", "
                + actualPValues[i].ToString(invariant) + ", "
                + actualLogFoldChange[i].ToString(invariant) + ", "
                + significance);
        }
    }
}
/// <summary>
/// Fills in the missing (zero) intensity values of one protein.
/// For every sample whose intensity is 0, the share of missing values in that sample is computed from
/// <paramref name="missingFactor"/> and <paramref name="numberOfIntensityValuesInSample"/>; when that share is at most 0.5
/// the zero is overwritten with a draw from a normal distribution whose mean is shifted relative to the sample's
/// distribution and whose width depends on the missing share.
/// Samples with a missing share above 0.5 are left at 0.
/// </summary>
/// <param name="proteinRowInfo">Protein whose SamplesIntensityData dictionary is updated in place.</param>
/// <param name="samplesMeanIntensityValue">Mean intensity of each sample, indexed like <paramref name="samplesFileNames"/>.</param>
/// <param name="samplesStandardDeviation">Standard deviation of each sample's intensities.</param>
/// <param name="samplesFileNames">Sample names; used as keys into the protein's intensity dictionary.</param>
/// <param name="missingFactor">Number of missing intensity values counted for each sample.</param>
/// <param name="numberOfIntensityValuesInSample">Number of non-missing intensity values counted for each sample.</param>
/// <param name="meanFraction">Weight applied to the offset that is subtracted from the sampling mean.</param>
// NOTE(review): other call sites visible in this file pass an additional trailing bool argument, and one of them
// calls this method from a test class - confirm that this private seven-parameter form is the current one.
private void ImputeData(ProteinRowInfo proteinRowInfo, double[] samplesMeanIntensityValue, double[] samplesStandardDeviation,
    List<string> samplesFileNames, double[] missingFactor, int[] numberOfIntensityValuesInSample, double meanFraction)
{
    Dictionary<string, double> intensityBySample = proteinRowInfo.SamplesIntensityData;

    for (int sampleIndex = 0; sampleIndex < samplesFileNames.Count; sampleIndex++)
    {
        string sampleName = samplesFileNames[sampleIndex];
        if (intensityBySample[sampleName] != 0)
        {
            // a value is already present for this sample
            continue;
        }

        double missingShare = missingFactor[sampleIndex]
            / (numberOfIntensityValuesInSample[sampleIndex] + missingFactor[sampleIndex]);

        // Written as a negated "<=" so that a NaN share is skipped as well.
        if (!(missingShare <= 0.5))
        {
            continue;
        }

        double sampleMean = samplesMeanIntensityValue[sampleIndex];
        double missingOdds = missingShare / (1 - missingShare);
        double imputedSpread = Math.Max(2 * missingShare, 0.3);
        double offsetScale = 0.6 * (1 - (missingShare * missingShare));
        double deltaX = imputedSpread * offsetScale;

        // density of a normal curve centred on the sample mean, evaluated at the offset point
        Normal densityCurve = new Normal(sampleMean, imputedSpread);
        double densityAtOffset = densityCurve.Density(sampleMean + offsetScale * imputedSpread);
        double yCoordinate = missingOdds * densityAtOffset;

        // quantile of the sample's own distribution at that probability
        Normal sampleDistribution = new Normal(sampleMean, samplesStandardDeviation[sampleIndex]);
        double deltaMu = sampleDistribution.InverseCumulativeDistribution(yCoordinate);

        double shiftedMean = deltaMu - deltaX * meanFraction;
        Normal imputationDistribution = new Normal(shiftedMean, imputedSpread);
        intensityBySample[sampleName] = imputationDistribution.Sample();
    }
}
/// <summary>
/// Adds one to the per-sample missing-value tally for every sample in which this protein's intensity is recorded as 0.
/// </summary>
/// <param name="proteinRowInfo">Protein whose intensity values are inspected.</param>
/// <param name="missingFactor">Running count of missing values per sample; updated in place.</param>
/// <param name="samplesFileNames">Sample names; position i corresponds to <paramref name="missingFactor"/>[i].</param>
public void CalculateMissingIntensityValuesInSample(ProteinRowInfo proteinRowInfo, double[] missingFactor, List<string> samplesFileNames)
{
    Dictionary<string, double> intensityBySample = proteinRowInfo.SamplesIntensityData;

    int sampleIndex = 0;
    foreach (string sampleName in samplesFileNames)
    {
        // a stored 0 marks a missing intensity for this sample
        if (intensityBySample[sampleName] == 0)
        {
            missingFactor[sampleIndex] += 1;
        }

        sampleIndex++;
    }
}
/// <summary>
/// Gathers the per-sample statistics needed for imputation (count and sum of non-missing intensities, mean,
/// standard deviation, number of missing values) over all proteins, then imputes the missing intensity values
/// of every protein.
/// </summary>
/// <param name="allProteinInfo">Proteins to analyse; their intensity dictionaries are updated by the imputation step.</param>
/// <param name="samplesFileNames">Sample names; every per-sample array is indexed in this order.</param>
/// <param name="meanFraction">Passed through unchanged to ImputeData.</param>
public void RunImputationProcess(List<ProteinRowInfo> allProteinInfo, List<string> samplesFileNames, double meanFraction)
{
    int sampleCount = samplesFileNames.Count;

    double[] intensitySums = new double[sampleCount];
    double[] meanIntensities = new double[sampleCount];
    double[] squaredDeviationSums = new double[sampleCount];
    double[] standardDeviations = new double[sampleCount];
    double[] missingCounts = new double[sampleCount];
    // MIGHT HAVE TO REMOVE
    int[] validCounts = new int[sampleCount];

    // pass 1: count and sum of the non-missing intensities of each sample
    foreach (ProteinRowInfo protein in allProteinInfo)
    {
        CalculateNumberAndSumOfIntensityValuesOfSample(protein, intensitySums, samplesFileNames, validCounts);
    }

    // mean of the non-missing intensities of each sample
    for (int s = 0; s < sampleCount; s++)
    {
        meanIntensities[s] = intensitySums[s] / validCounts[s];
    }

    // pass 2: sum of squared deviations from the mean for each sample
    foreach (ProteinRowInfo protein in allProteinInfo)
    {
        CalculateSampleStandardDeviationNumerator(protein, squaredDeviationSums, samplesFileNames, meanIntensities);
    }

    // pass 3: number of missing intensities in each sample
    foreach (ProteinRowInfo protein in allProteinInfo)
    {
        CalculateMissingIntensityValuesInSample(protein, missingCounts, samplesFileNames);
    }

    // standard deviation of each sample; the divisor is the number of proteins minus the missing count
    for (int s = 0; s < sampleCount; s++)
    {
        standardDeviations[s] = Math.Sqrt(squaredDeviationSums[s] / (allProteinInfo.Count - missingCounts[s]));
    }

    // pass 4: impute; the trailing false is forwarded as-is (its meaning is defined by ImputeData - TODO confirm)
    foreach (ProteinRowInfo protein in allProteinInfo)
    {
        ImputeData(protein, meanIntensities, standardDeviations, samplesFileNames, missingCounts, validCounts,
            meanFraction, false);
    }
}
/// <summary>
/// Adds this protein's contribution to the per-sample sum of squared deviations from the sample mean,
/// i.e. the numerator used when computing each sample's standard deviation. Intensities stored as 0 are ignored.
/// </summary>
/// <param name="proteinRowInfo">Protein whose intensity values are read.</param>
/// <param name="samplesStandardDeviationNumerators">Running sum of squared deviations per sample; updated in place.</param>
/// <param name="samplesFileNames">Sample names; position i corresponds to index i of both arrays.</param>
/// <param name="samplesMeanIntensityValue">Mean intensity of each sample.</param>
public void CalculateSampleStandardDeviationNumerator(ProteinRowInfo proteinRowInfo, double[] samplesStandardDeviationNumerators,
    List<string> samplesFileNames, double[] samplesMeanIntensityValue)
{
    Dictionary<string, double> intensityBySample = proteinRowInfo.SamplesIntensityData;

    for (int s = 0; s < samplesFileNames.Count; s++)
    {
        double intensity = intensityBySample[samplesFileNames[s]];
        if (intensity != 0)
        {
            // squared distance of this value from its sample's mean
            double deviation = intensity - samplesMeanIntensityValue[s];
            samplesStandardDeviationNumerators[s] = samplesStandardDeviationNumerators[s] + deviation * deviation;
        }
    }
}
/// <summary>
/// Adds this protein's non-zero intensity values to the per-sample running sums and increments the per-sample
/// count of valid (non-zero) intensity values accordingly.
/// </summary>
/// <param name="proteinRowInfo">Protein whose intensity values are read.</param>
/// <param name="sampleAllIntensityValuesSum">Running intensity sum per sample; updated in place.</param>
/// <param name="samplesFileNames">Sample names; position i corresponds to index i of both arrays.</param>
/// <param name="numberOfIntensityValuesInSample">Running count of valid intensity values per sample; updated in place.</param>
public void CalculateNumberAndSumOfIntensityValuesOfSample(ProteinRowInfo proteinRowInfo, double[] sampleAllIntensityValuesSum,
    List<string> samplesFileNames, int[] numberOfIntensityValuesInSample)
{
    Dictionary<string, double> intensityBySample = proteinRowInfo.SamplesIntensityData;

    for (int s = 0; s < samplesFileNames.Count; s++)
    {
        double intensity = intensityBySample[samplesFileNames[s]];
        if (intensity != 0)
        {
            // only values that are present contribute to the sum and to the count
            sampleAllIntensityValuesSum[s] = sampleAllIntensityValuesSum[s] + intensity;
            numberOfIntensityValuesInSample[s] += 1;
        }
    }
}
/// <summary>
/// Parses the tab-separated QuantifiedProteins file output by FlashLFQ.
/// Every header starting with "Intensity" is appended to <paramref name="samplesFileNames"/>. Every data row with at
/// most <paramref name="maxAllowedMissingValues"/> missing (blank or zero) intensity values is converted into a
/// ProteinRowInfo holding the log2 of each intensity (missing values stay 0) and appended to
/// <paramref name="allProteinInfo"/>; rows with more missing values are dropped.
/// </summary>
/// <param name="allProteinInfo">Receives one entry per accepted protein row.</param>
/// <param name="maxAllowedMissingValues">Largest number of missing intensity values a row may have and still be kept.</param>
/// <param name="samplesFileNames">Receives the names of the intensity columns, in file order.</param>
/// <param name="quantifiedProteinFileLocation">Path of the QuantifiedProteins file.</param>
/// <exception cref="InvalidDataException">The file has no header row or no column whose name starts with "Intensity".</exception>
/// <exception cref="FormatException">An intensity cell is neither blank nor a number.</exception>
public void ProteinDataParser(List<ProteinRowInfo> allProteinInfo, int maxAllowedMissingValues, List<string> samplesFileNames,
    string quantifiedProteinFileLocation)
{
    string[] lines = File.ReadAllLines(quantifiedProteinFileLocation);
    if (lines.Length == 0)
    {
        throw new InvalidDataException("'" + quantifiedProteinFileLocation + "' is empty; a header row is required.");
    }

    // parse header: remember the position of every intensity column explicitly instead of assuming that the
    // columns are contiguous or that the first one is never at index 0
    string[] header = lines[0].Split('\t');
    List<int> intensityColumns = new List<int>();
    int proteinIDIndex = 0;
    for (int i = 0; i < header.Length; i++)
    {
        if (header[i].StartsWith("Intensity", StringComparison.Ordinal))
        {
            intensityColumns.Add(i);
            samplesFileNames.Add(header[i]);
        }

        // the last header starting with "Protein" wins
        if (header[i].StartsWith("Protein", StringComparison.Ordinal))
        {
            proteinIDIndex = i;
        }
    }

    if (intensityColumns.Count == 0)
    {
        throw new InvalidDataException("'" + quantifiedProteinFileLocation + "' has no column whose name starts with \"Intensity\".");
    }

    // The file is machine-written, so numbers are read with the invariant culture rather than the culture of
    // whichever machine happens to run the analysis.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // parse data
    double[] rawIntensities = new double[intensityColumns.Count];
    for (int l = 1; l < lines.Length; l++)
    {
        if (string.IsNullOrWhiteSpace(lines[l]))
        {
            continue; // tolerate blank lines, e.g. at the end of the file
        }

        string[] proteinRow = lines[l].Split('\t');

        // read every intensity once and count how many are missing (blank or 0)
        int numMissingValues = 0;
        for (int c = 0; c < intensityColumns.Count; c++)
        {
            string writtenValue = proteinRow[intensityColumns[c]];
            double raw = string.IsNullOrWhiteSpace(writtenValue) ? 0 : double.Parse(writtenValue, invariant);
            rawIntensities[c] = raw;
            if (raw == 0)
            {
                numMissingValues++;
            }
        }

        if (numMissingValues > maxAllowedMissingValues)
        {
            continue; // too many missing values: the protein is filtered out
        }

        ProteinRowInfo proteinRowInfo = new ProteinRowInfo();
        proteinRowInfo.ProteinID = proteinRow[proteinIDIndex];
        for (int c = 0; c < intensityColumns.Count; c++)
        {
            // key by the column's own header so the mapping is right even if the caller's list was not empty
            double raw = rawIntensities[c];
            proteinRowInfo.SamplesIntensityData[header[intensityColumns[c]]] = raw == 0 ? 0 : Math.Log(raw, 2);
        }

        allProteinInfo.Add(proteinRowInfo);
    }
}
/// <summary>
/// Entry point. For every pair of experimental conditions a grid search is run over the mean fraction
/// (0.1, 0.4, 0.7), the sO value (0.1, 0.2, ... 0.9) and the number of missing intensity values tolerated per protein
/// (0 ... numberSamples - 1). For each combination the proteins are parsed, imputed and tested; whenever a
/// combination yields more significant proteins than any earlier one for that pair, the classification is written to
/// the output CSV and the combination is logged to the console.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    const string experimentalDesignLocation = "C:/Users/Anay/Desktop/UW Madison/Smith Lab/Spectra Data/ExperimentalDesign.tsv";
    const string quantifiedProteinsLocation = "C:/Users/Anay/Desktop/UW Madison/Smith Lab/Spectra Data/FlashLFQ_2020-04-26-17-39-35/QuantifiedProteins.tsv";
    // NOTE(review): every condition pair writes to this same file, so a later pair overwrites the output of an
    // earlier one - confirm that this is intended.
    const string significantProteinsLocation = "C:/Users/Anay/Desktop/UW Madison/Smith Lab/Project 1/ConsoleApp1/ProteinBasedSignificanceModified.csv";

    RunProteinSignificanceClassifier proteinBasedSignificance = new RunProteinSignificanceClassifier();
    int numberSamples = 0;

    // Parse the ExperimentalDesign File to get info of samples and conditions they belong to
    Dictionary<string, List<string>> samplefileConditionRelation =
        proteinBasedSignificance.ExpermientalDesignParser(experimentalDesignLocation, ref numberSamples);

    // get all conditions and pair them up for Significance classification
    List<string> allConditions = new List<string>(samplefileConditionRelation.Keys);
    List<List<string>> allTwoConditionCombinations = proteinBasedSignificance.GenerateAllCombinationsOfTwoConditions(allConditions);

    foreach (List<string> conditionPair in allTwoConditionCombinations)
    {
        string firstCondition = conditionPair[0];
        string secondCondition = conditionPair[1];

        // The samples of each condition do not change during the search, so they are looked up once per pair.
        List<string> firstConditionAssociatedSamples = samplefileConditionRelation.GetValueOrDefault(firstCondition);
        List<string> secondConditionAssociatedSamples = samplefileConditionRelation.GetValueOrDefault(secondCondition);

        int maxSignificantCount = 0;

        // Both grids are driven by integer counters. Accumulating 0.1 in a double reaches 0.9999999999999999
        // instead of 1, which made the former "while (sOValue < 1)" loop run one extra, unintended iteration.
        for (int meanStep = 0; meanStep < 3; meanStep++)
        {
            double meanFraction = (1 + 3 * meanStep) / 10.0; // 0.1, 0.4, 0.7

            for (int sOStep = 1; sOStep <= 9; sOStep++)
            {
                double sOValue = sOStep / 10.0; // 0.1 ... 0.9

                for (int k = 0; k < numberSamples; k++)
                {
                    proteinBasedSignificance = new RunProteinSignificanceClassifier();

                    // Filled by parsing the QuantifiedProteins file
                    List<ProteinRowInfo> allProteinInfo = new List<ProteinRowInfo>();
                    List<string> samplesFileNames = new List<string>();
                    int maxInvalidIntensityValues = k;
                    proteinBasedSignificance.ProteinDataParser(allProteinInfo, maxInvalidIntensityValues, samplesFileNames,
                        quantifiedProteinsLocation);

                    // imputes missing intensity values for each protein
                    ImputationProcess imputationProcess = new ImputationProcess();
                    imputationProcess.RunImputationProcess(allProteinInfo, samplesFileNames, meanFraction);

                    // Filled by the T-Tests and Permutation Tests
                    List<double> observedNValues = new List<double>();
                    List<double> permutedNValues = new List<double>();
                    StatisticalTests statisticalTests = new StatisticalTests();
                    // protein ID -> observed N value, P Value and Log Fold Change, in that order
                    Dictionary<string, List<double>> allProteinObservedStatistics = new Dictionary<string, List<double>>();
                    object resultsGate = new object();

                    // One parallel iteration per protein; the three result collections are only touched under the lock.
                    Parallel.For(0, allProteinInfo.Count, i =>
                    {
                        ProteinRowInfo proteinRowInfo = allProteinInfo[i];
                        Dictionary<string, double> samplesintensityData = proteinRowInfo.SamplesIntensityData;
                        List<double> proteinFirstConditionIntensityValues = new List<double>();
                        List<double> proteinSecondConditionIntensityValues = new List<double>();

                        // get the protein's intensity values corresponding to the chosen pair of conditions
                        foreach (string sampleFileName in samplesFileNames)
                        {
                            if (firstConditionAssociatedSamples.Contains(sampleFileName))
                            {
                                proteinFirstConditionIntensityValues.Add(samplesintensityData[sampleFileName]);
                            }
                            if (secondConditionAssociatedSamples.Contains(sampleFileName))
                            {
                                proteinSecondConditionIntensityValues.Add(samplesintensityData[sampleFileName]);
                            }
                        }

                        // observed statistics of the protein for the chosen pair of conditions (T-Test)
                        List<double> proteinStatistics = statisticalTests.GetNValueUsingTTest(
                            proteinFirstConditionIntensityValues, proteinSecondConditionIntensityValues, sOValue, false);

                        // permuted N values of the protein for the chosen pair of conditions
                        List<double> proteinPermutedNavlues = statisticalTests.GetNValueUsingPermutationtests(
                            proteinFirstConditionIntensityValues, proteinSecondConditionIntensityValues, sOValue);

                        lock (resultsGate)
                        {
                            allProteinObservedStatistics.Add(proteinRowInfo.ProteinID,
                                new List<double>() { proteinStatistics[0], proteinStatistics[1], proteinStatistics[2] });
                            observedNValues.Add(proteinStatistics[0]);
                            permutedNValues.AddRange(proteinPermutedNavlues);
                        }
                    });

                    // makes the permuted N values list and the observed N Values list of the same size
                    proteinBasedSignificance.ResizePermutedArray(permutedNValues, permutedNValues.Count - observedNValues.Count);

                    // get the threshold at which we will filter out the significant proteins
                    double nValueThreshold = statisticalTests.calculateNvaluethreshold(observedNValues, permutedNValues, 0.05);

                    // determine number of significant proteins detected
                    int newSignificantCount = observedNValues.Count(x => x >= nValueThreshold);

                    if (newSignificantCount > maxSignificantCount)
                    {
                        maxSignificantCount = newSignificantCount;
                        proteinBasedSignificance.PrintSignificantProtein(allProteinInfo, nValueThreshold,
                            allProteinObservedStatistics, significantProteinsLocation);
                        Console.WriteLine("Sig Count - " + maxSignificantCount + ", meanFraction - " + meanFraction
                            + ", sOValue - " + sOValue + ", k - " + k);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Entry point. For every pair of experimental conditions a grid search is run over the mean fraction
/// (0.1, 0.4, 0.7), the sO value (0.1, 0.2, ... 0.9) and the number of missing intensity values tolerated per protein
/// (1 ... 8). For each combination the proteins are parsed, imputed and tested; whenever a combination yields more
/// significant proteins than any earlier one for that pair, the classification is written to the output CSV.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    const string experimentalDesignLocation = "C:/Users/Anay/Desktop/UW Madison/Smith Lab/Spectra Data/ExperimentalDesign.tsv";
    const string quantifiedProteinsLocation = "C:/Users/Anay/Desktop/UW Madison/Smith Lab/Spectra Data/FlashLFQ_2020-04-26-17-39-35/QuantifiedProteins.tsv";
    // NOTE(review): every condition pair writes to this same file, so a later pair overwrites the output of an
    // earlier one - confirm that this is intended.
    const string significantProteinsLocation = "C:/Users/Anay/Desktop/UW Madison/Smith Lab/Project 1/ConsoleApp1/ProteinBaseedSignificance.csv";

    Program proteinBasedSignificance = new Program();

    // Parse the ExperimentalDesign File to get info of samples and conditions they belong to
    Dictionary<string, List<string>> samplefileConditionRelation =
        proteinBasedSignificance.ExpermientalDesignParser(experimentalDesignLocation);

    // get all conditions and pair them up for Significance classification
    List<string> allConditions = new List<string>(samplefileConditionRelation.Keys);
    List<List<string>> allTwoConditionCombinations = proteinBasedSignificance.GenerateAllCombinationsOfTwoConditions(allConditions);

    foreach (List<string> conditionPair in allTwoConditionCombinations)
    {
        string firstCondition = conditionPair[0];
        string secondCondition = conditionPair[1];

        // The samples of each condition do not change during the search, so they are looked up once per pair.
        List<string> firstConditionAssociatedSamples = samplefileConditionRelation.GetValueOrDefault(firstCondition);
        List<string> secondConditionAssociatedSamples = samplefileConditionRelation.GetValueOrDefault(secondCondition);

        int maxSignificantCount = 0;

        // Both grids are driven by integer counters. Accumulating 0.1 in a double reaches 0.9999999999999999
        // instead of 1, which made the former "while (sOValue < 1)" loop run one extra, unintended iteration.
        for (int meanStep = 0; meanStep < 3; meanStep++)
        {
            double meanFraction = (1 + 3 * meanStep) / 10.0; // 0.1, 0.4, 0.7

            for (int sOStep = 1; sOStep <= 9; sOStep++)
            {
                double sOValue = sOStep / 10.0; // 0.1 ... 0.9

                for (int k = 1; k < 9; k++)
                {
                    proteinBasedSignificance = new Program();

                    // Filled by parsing the QuantifiedProteins file
                    List<ProteinRowInfo> allProteinInfo = new List<ProteinRowInfo>();
                    List<string> samplesFileNames = new List<string>();
                    int maxInvalidIntensityValues = k;
                    proteinBasedSignificance.ProteinDataParser(allProteinInfo, maxInvalidIntensityValues, samplesFileNames,
                        quantifiedProteinsLocation);

                    // imputes missing intensity values for each protein
                    ImputationProcess imputationProcess = new ImputationProcess();
                    imputationProcess.RunImputationProcess(allProteinInfo, samplesFileNames, meanFraction);

                    // Filled by the T-Tests and Permutation Tests
                    List<double> actualNValues = new List<double>();       // actual(real) N values
                    List<double> actualPValues = new List<double>();       // actual(real) P values
                    List<double> actualLogFoldChange = new List<double>(); // actual(real) Log Fold Change values
                    List<double> permutedNValues = new List<double>();     // permuted(fake) N values
                    StatisticalTests statisticalTests = new StatisticalTests();

                    // Compute actual and permuted N Values for each protein using T Tests and Permutation Testing
                    foreach (ProteinRowInfo proteinRowInfo in allProteinInfo)
                    {
                        Dictionary<string, double> samplesintensityData = proteinRowInfo.SamplesIntensityData;
                        List<double> proteinFirstConditionIntensityValues = new List<double>();
                        List<double> proteinSecondConditionIntensityValues = new List<double>();

                        // get the protein's intensity values corresponding to the chosen pair of conditions
                        foreach (string sampleFileName in samplesFileNames)
                        {
                            if (firstConditionAssociatedSamples.Contains(sampleFileName))
                            {
                                proteinFirstConditionIntensityValues.Add(samplesintensityData[sampleFileName]);
                            }
                            if (secondConditionAssociatedSamples.Contains(sampleFileName))
                            {
                                proteinSecondConditionIntensityValues.Add(samplesintensityData[sampleFileName]);
                            }
                        }

                        // actual(real) statistics for the chosen pair of conditions; results are passed the three actual lists
                        statisticalTests.GetNValueUsingTTest(proteinFirstConditionIntensityValues,
                            proteinSecondConditionIntensityValues, actualNValues, actualPValues, actualLogFoldChange, sOValue);

                        // permuted(fake) N values for the chosen pair of conditions; results are passed permutedNValues
                        statisticalTests.GetNValueUsingPermutationtests(proteinFirstConditionIntensityValues,
                            proteinSecondConditionIntensityValues, permutedNValues, sOValue);
                    }

                    // makes the permuted N values list and the actual N Values list of the same size
                    proteinBasedSignificance.ResizePermutedArray(permutedNValues, permutedNValues.Count - actualNValues.Count);

                    // The threshold calculation is handed a copy of the actual N values; the original list is used afterwards.
                    List<double> actualNValuesCopy = new List<double>(actualNValues);

                    // get the threshold at which we will filter out the significant proteins
                    double nValueThreshold = statisticalTests.calculateNvaluethreshold(actualNValuesCopy, permutedNValues, 0.05);

                    // determine number of significant proteins detected
                    int newSignificantCount = actualNValues.Count(x => x >= nValueThreshold);

                    if (newSignificantCount > maxSignificantCount)
                    {
                        maxSignificantCount = newSignificantCount;
                        proteinBasedSignificance.PrintSignificantProtein(allProteinInfo, actualNValues, nValueThreshold,
                            actualPValues, actualLogFoldChange, significantProteinsLocation);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Feeds two proteins in turn to CalculateNumberAndSumOfIntensityValuesOfSample, reusing the same output arrays, and
/// checks after each call that the per-sample count of non-zero intensity values has accumulated as expected.
/// </summary>
public void TestCalculateNumberOfIntensityValuesOfSample()
{
    List<string> sampleFileNames = new List<string>
    {
        "Intensity_B02_06_161103_A1_HCD_OT_4ul",
        "Intensity_B02_07_161103_A2_HCD_OT_4ul",
        "Intensity_B02_16_161103_A3_HCD_OT_4ul",
        "Intensity_B02_17_161103_A4_HCD_OT_4ul",
        "Intensity_B02_24_161103_C1_HCD_OT_4ul",
        "Intensity_B02_09_161103_C2_HCD_OT_4ul",
        "Intensity_B02_14_161103_C3_HCD_OT_4ul",
        "Intensity_B02_19_161103_C4_HCD_OT_4ul",
    };

    // builds a protein whose intensities are given in the order of sampleFileNames
    ProteinRowInfo BuildProtein(string proteinId, params double[] intensities)
    {
        ProteinRowInfo protein = new ProteinRowInfo();
        protein.ProteinID = proteinId;
        for (int s = 0; s < intensities.Length; s++)
        {
            protein.SamplesIntensityData[sampleFileNames[s]] = intensities[s];
        }
        return protein;
    }

    ProteinRowInfo sampleprotein1 = BuildProtein("A0A023KES1", 0, 22.729, 22.347, 22.397, 0, 23.180, 23.293, 0);
    ProteinRowInfo sampleprotein2 = BuildProtein("A0A024R4E5", 0, 25.535, 0, 0, 25.370, 24, 25.359, 0);

    int[] outputSampleIntensityValuesCount = new int[8];
    double[] outputSampleAllIntensityValuesSum = new double[8];
    ImputationProcess imputationProcess = new ImputationProcess();

    // after the first protein: one count for each of its non-zero samples
    int[] expectedAfterFirstProtein = { 0, 1, 1, 1, 0, 1, 1, 0 };
    imputationProcess.CalculateNumberAndSumOfIntensityValuesOfSample(sampleprotein1, outputSampleAllIntensityValuesSum,
        sampleFileNames, outputSampleIntensityValuesCount);
    CollectionAssert.AreEqual(expectedAfterFirstProtein, outputSampleIntensityValuesCount);

    // after the second protein: counts accumulate on top of the first call
    int[] expectedAfterSecondProtein = { 0, 2, 1, 1, 1, 2, 2, 0 };
    imputationProcess.CalculateNumberAndSumOfIntensityValuesOfSample(sampleprotein2, outputSampleAllIntensityValuesSum,
        sampleFileNames, outputSampleIntensityValuesCount);
    CollectionAssert.AreEqual(expectedAfterSecondProtein, outputSampleIntensityValuesCount);
}
/// <summary>
/// Parses "TestQuantifiedProteins.tsv" with at most two missing values allowed per protein and checks that every
/// protein returned is one of the three expected proteins and that its per-sample intensities match the expected
/// values to within 0.001.
/// </summary>
public void TestProteinDataParserProteinIntensityMapping()
{
    string[] sampleFileNames =
    {
        "Intensity_B02_06_161103_A1_HCD_OT_4ul",
        "Intensity_B02_07_161103_A2_HCD_OT_4ul",
        "Intensity_B02_16_161103_A3_HCD_OT_4ul",
        "Intensity_B02_17_161103_A4_HCD_OT_4ul",
        "Intensity_B02_24_161103_C1_HCD_OT_4ul",
        "Intensity_B02_09_161103_C2_HCD_OT_4ul",
        "Intensity_B02_14_161103_C3_HCD_OT_4ul",
        "Intensity_B02_19_161103_C4_HCD_OT_4ul",
    };

    // builds an expected protein whose intensities are given in the order of sampleFileNames
    ProteinRowInfo BuildProtein(string proteinId, params double[] intensities)
    {
        ProteinRowInfo protein = new ProteinRowInfo();
        protein.ProteinID = proteinId;
        for (int s = 0; s < intensities.Length; s++)
        {
            protein.SamplesIntensityData[sampleFileNames[s]] = intensities[s];
        }
        return protein;
    }

    List<ProteinRowInfo> expectedAllProteinInfo = new List<ProteinRowInfo>
    {
        BuildProtein("A0A023KES1", 22.192, 22.729, 22.347, 22.397, 23.489, 23.180, 23.293, 23.271),
        BuildProtein("A0A024R4E5", 25.535, 25.482, 25.308, 25.373, 25.370, 25.368, 25.359, 25.251),
        BuildProtein("A0A087WTI9", 0, 23.763, 20.783, 20.165, 21.7858, 0, 21.564, 20.469),
    };

    Dictionary<string, ProteinRowInfo> expectedByProteinId = new Dictionary<string, ProteinRowInfo>();
    foreach (ProteinRowInfo expectedProtein in expectedAllProteinInfo)
    {
        expectedByProteinId[expectedProtein.ProteinID] = expectedProtein;
    }

    int maxAllowedMissingValues = 2;
    List<ProteinRowInfo> outputAllProteinInfo = new List<ProteinRowInfo>();
    List<string> outputSamplesFileNames = new List<string>();
    RunProteinSignificanceClassifier runProteinSignificanceClassifier = new RunProteinSignificanceClassifier();
    runProteinSignificanceClassifier.ProteinDataParser(outputAllProteinInfo, maxAllowedMissingValues, outputSamplesFileNames,
        "TestQuantifiedProteins.tsv");

    // Without this guard the loop below would pass vacuously if the parser returned nothing at all.
    Assert.That(outputAllProteinInfo, Is.Not.Empty);

    foreach (ProteinRowInfo outputProtein in outputAllProteinInfo)
    {
        if (expectedByProteinId.TryGetValue(outputProtein.ProteinID, out ProteinRowInfo expectedProtein))
        {
            // actual value first, expected value inside the constraint, so that failure messages read correctly
            Assert.That(outputProtein.SamplesIntensityData, Is.EqualTo(expectedProtein.SamplesIntensityData).Within(0.001));
        }
        else
        {
            Assert.Fail("Unexpected protein in parser output: " + outputProtein.ProteinID);
        }
    }
}
/// <summary>
/// Runs ImputeData on a protein with three missing (zero) intensities, using fixed per-sample statistics, and checks
/// that the resulting intensities match the expected values to within 0.001.
/// </summary>
// NOTE(review): the expected imputed values are fixed numbers, so the trailing true argument presumably makes the
// imputation deterministic - confirm against ImputeData.
public void TestImputeData()
{
    List<string> sampleFileNames = new List<string>
    {
        "Intensity_B02_06_161103_A1_HCD_OT_4ul",
        "Intensity_B02_07_161103_A2_HCD_OT_4ul",
        "Intensity_B02_16_161103_A3_HCD_OT_4ul",
        "Intensity_B02_17_161103_A4_HCD_OT_4ul",
        "Intensity_B02_24_161103_C1_HCD_OT_4ul",
        "Intensity_B02_09_161103_C2_HCD_OT_4ul",
        "Intensity_B02_14_161103_C3_HCD_OT_4ul",
        "Intensity_B02_19_161103_C4_HCD_OT_4ul",
    };

    // builds a protein whose intensities are given in the order of sampleFileNames
    ProteinRowInfo BuildProtein(string proteinId, params double[] intensities)
    {
        ProteinRowInfo protein = new ProteinRowInfo();
        protein.ProteinID = proteinId;
        for (int s = 0; s < intensities.Length; s++)
        {
            protein.SamplesIntensityData[sampleFileNames[s]] = intensities[s];
        }
        return protein;
    }

    // expected result: the three zeros of the input replaced by imputed values, everything else unchanged
    ProteinRowInfo expectedSampleprotein =
        BuildProtein("A0A087WTI9", 24.607, 23.763, 20.783, 20.165, 20.639, 17.223, 21.564, 20.469);

    // per-sample statistics handed to the imputation
    double[] sampleMissingCount = { 20, 7, 11, 0, 32, 2, 0, 5 };
    int[] numberOfValidIntensityValuesInSample = { 120, 135, 100, 87, 189, 50, 79, 30 };
    double[] samplesMeanIntensityValue = { 24.4, 23.5, 22, 19.7, 21.75, 18.2, 23.21, 21.09 };
    double[] samplesStandardDeviation = { 0.2, 1.1, 0.02, 0.7, 1.7, 0.8, 2.4, 0.15 };

    // input protein: samples A1, C1 and C2 are missing
    ProteinRowInfo outputSampleprotein =
        BuildProtein("A0A087WTI9", 0, 23.763, 20.783, 20.165, 0, 0, 21.564, 20.469);

    double meanFraction = 0.2;
    ImputationProcess imputationProcess = new ImputationProcess();
    imputationProcess.ImputeData(outputSampleprotein, samplesMeanIntensityValue, samplesStandardDeviation, sampleFileNames,
        sampleMissingCount, numberOfValidIntensityValuesInSample, meanFraction, true);

    Assert.That(expectedSampleprotein.SamplesIntensityData,
        Is.EqualTo(outputSampleprotein.SamplesIntensityData).Within(0.001));
}
/// <summary>
/// Verifies that <c>CalculateSampleStandardDeviationNumerator</c> adds, per sample, the squared
/// deviation of a protein's intensity from that sample's mean into the supplied numerator array,
/// and that the array keeps accumulating when a second protein is processed with the same array.
/// </summary>
/// <remarks>
/// The expected values below assume that an intensity of 0 contributes nothing to the numerator.
/// Results are compared with a small absolute tolerance instead of exact equality so the test does
/// not depend on the exact floating-point arithmetic form used by the implementation.
/// NOTE(review): no [Test] attribute is visible on this method in the reviewed excerpt - confirm
/// that the test runner actually discovers it.
/// </remarks>
public void TestCalculateSampleStandardDeviationNumerator()
{
    // Absolute tolerance for comparing accumulated squared deviations (magnitudes here are < 1e3).
    const double tolerance = 1e-9;

    List<string> sampleFileNames = new List<string>
    {
        "Intensity_B02_06_161103_A1_HCD_OT_4ul",
        "Intensity_B02_07_161103_A2_HCD_OT_4ul",
        "Intensity_B02_16_161103_A3_HCD_OT_4ul",
        "Intensity_B02_17_161103_A4_HCD_OT_4ul",
        "Intensity_B02_24_161103_C1_HCD_OT_4ul",
        "Intensity_B02_09_161103_C2_HCD_OT_4ul",
        "Intensity_B02_14_161103_C3_HCD_OT_4ul",
        "Intensity_B02_19_161103_C4_HCD_OT_4ul"
    };

    // Intensities are listed in the same order as sampleFileNames.
    double[] protein1Intensities = { 0, 22.729, 22.347, 22.397, 0, 23.180, 23.293, 0 };
    double[] protein2Intensities = { 0, 25.535, 0, 0, 25.370, 24, 25.359, 0 };

    ProteinRowInfo sampleprotein1 = new ProteinRowInfo();
    sampleprotein1.ProteinID = "A0A023KES1";
    ProteinRowInfo sampleprotein2 = new ProteinRowInfo();
    sampleprotein2.ProteinID = "A0A024R4E5";
    for (int i = 0; i < sampleFileNames.Count; i++)
    {
        sampleprotein1.SamplesIntensityData[sampleFileNames[i]] = protein1Intensities[i];
        sampleprotein2.SamplesIntensityData[sampleFileNames[i]] = protein2Intensities[i];
    }

    // The means are plain inputs to the method under test; each is the two-protein sum divided by 2.
    double[] samplesMeanIntensityValue =
    {
        0,
        (22.729 + 25.535) / 2,
        22.347 / 2,
        22.397 / 2,
        25.370 / 2,
        (23.180 + 24) / 2,
        (23.293 + 25.359) / 2,
        0
    };

    // Expected numerators after only the first protein has been processed.
    double[] expectedAfterFirstProtein =
    {
        0,
        Math.Pow(22.729 - samplesMeanIntensityValue[1], 2),
        Math.Pow(22.347 - samplesMeanIntensityValue[2], 2),
        Math.Pow(22.397 - samplesMeanIntensityValue[3], 2),
        0,
        Math.Pow(23.180 - samplesMeanIntensityValue[5], 2),
        Math.Pow(23.293 - samplesMeanIntensityValue[6], 2),
        0
    };

    // Expected numerators after the second protein has been added on top of the first.
    double[] expectedAfterBothProteins =
    {
        0,
        Math.Pow(22.729 - samplesMeanIntensityValue[1], 2) + Math.Pow(25.535 - samplesMeanIntensityValue[1], 2),
        Math.Pow(22.347 - samplesMeanIntensityValue[2], 2),
        Math.Pow(22.397 - samplesMeanIntensityValue[3], 2),
        Math.Pow(25.370 - samplesMeanIntensityValue[4], 2),
        Math.Pow(23.180 - samplesMeanIntensityValue[5], 2) + Math.Pow(24 - samplesMeanIntensityValue[5], 2),
        Math.Pow(23.293 - samplesMeanIntensityValue[6], 2) + Math.Pow(25.359 - samplesMeanIntensityValue[6], 2),
        0
    };

    // The same output array is passed to both calls so the second call accumulates onto the first.
    double[] outputSamplesStandardDeviationNumerators = new double[8];
    ImputationProcess imputationProcess = new ImputationProcess();

    imputationProcess.CalculateSampleStandardDeviationNumerator(sampleprotein1, outputSamplesStandardDeviationNumerators, sampleFileNames, samplesMeanIntensityValue);
    Assert.That(outputSamplesStandardDeviationNumerators, Is.EqualTo(expectedAfterFirstProtein).Within(tolerance));

    imputationProcess.CalculateSampleStandardDeviationNumerator(sampleprotein2, outputSamplesStandardDeviationNumerators, sampleFileNames, samplesMeanIntensityValue);
    Assert.That(outputSamplesStandardDeviationNumerators, Is.EqualTo(expectedAfterBothProteins).Within(tolerance));
}