/// <summary>
/// Reads a three-column genotype file and returns the rows whose item and
/// experiment both appear in the supplied collections.
/// </summary>
/// <param name="filePath">Path passed on to <c>FileServer.ReadMultipleColumns</c>.</param>
/// <param name="items">Items to keep; compared against the trimmed first column.</param>
/// <param name="experiments">Experiments to keep; compared against the trimmed second column.</param>
/// <returns>
/// A table named "Genotype" with the string columns "Item", "Experiment" and "Alleles".
/// </returns>
protected DataTable ParseFromFile(string filePath, CIStringCollection items, CIStringCollection experiments)
{
    FileServer reader = new FileServer();

    OnStatusChange("Retrieving genotypes from file");
    string[][] rows = reader.ReadMultipleColumns(filePath, 3);

    DataTable genotypes = new DataTable("Genotype");
    foreach (string columnName in new string[] { "Item", "Experiment", "Alleles" })
    {
        genotypes.Columns.Add(columnName, typeof(string));
    }

    int rowIndex = 0;
    foreach (string[] row in rows)
    {
        // Whitespace is ignored for the membership test only; the stored cells
        // are the values exactly as they were read.
        bool wanted = items.Contains(row[0].Trim()) && experiments.Contains(row[1].Trim());
        if (wanted)
        {
            genotypes.Rows.Add(new object[] { row[0], row[1], row[2] });
        }

        // Progress is reported with the zero-based row index, starting at row 0.
        if (rowIndex % 1000 == 0)
        {
            OnStatusChange("Finished reading line " + rowIndex.ToString());
        }
        rowIndex++;
    }

    return genotypes;
}
/// <summary>
/// Splits two collections into the values they share and the values unique to each.
/// Despite the method name, the returned collection holds only the values that are
/// present in BOTH inputs.
/// </summary>
/// <param name="values1">First collection.</param>
/// <param name="values2">Second collection.</param>
/// <param name="values1NotIn2">Receives the values of <paramref name="values1"/> that are not in <paramref name="values2"/>.</param>
/// <param name="values2NotIn1">Receives the values of <paramref name="values2"/> that are not in <paramref name="values1"/>.</param>
/// <returns>The values found in both collections, each added once.</returns>
private CIStringCollection Union(CIStringCollection values1, CIStringCollection values2, out CIStringCollection values1NotIn2, out CIStringCollection values2NotIn1)
{
    CIStringCollection shared = new CIStringCollection();
    values1NotIn2 = new CIStringCollection();
    values2NotIn1 = new CIStringCollection();

    // Pass 1: classify every value of source 1 as shared or source-1-only.
    foreach (string candidate in values1)
    {
        if (!values2.Contains(candidate))
        {
            if (!values1NotIn2.Contains(candidate))
            {
                values1NotIn2.Add(candidate);
            }
            continue;
        }

        if (!shared.Contains(candidate))
        {
            shared.Add(candidate);
        }
    }

    // Pass 2: collect the values that only source 2 has.
    foreach (string candidate in values2)
    {
        if (values1.Contains(candidate) || values2NotIn1.Contains(candidate))
        {
            continue;
        }
        values2NotIn1.Add(candidate);
    }

    return shared;
}
/// <summary>
/// Harmonizes the genotypes of <paramref name="dict2"/> after <paramref name="dict1"/>.
/// Experiments are dropped from both dictionaries when they show more than two alleles
/// in either source, are monomorphic in both sources, have incompatible SNP types, or
/// are A/T or G/C SNPs. The remaining genotypes are copied to the output dictionaries
/// and the second source is reverse-complemented wherever its alleles do not match
/// those of the first source.
/// </summary>
/// <remarks>
/// Side effects: <paramref name="dict1"/> and <paramref name="dict2"/> are modified
/// through <c>RemoveExperiment</c>, and the members MyRemovedExpTooManyAlleles1,
/// MyRemovedExpTooManyAlleles2, MyRemovedExpMonomorphicInBoth,
/// MyRemovedExpIncompatibleSNPTypes and MyRemovedExpATCG are replaced with the
/// experiments removed by the respective step.
/// NOTE(review): every key of dict1 is also looked up in dict2, so both dictionaries
/// are presumably expected to hold the same keys - confirm against the callers.
/// </remarks>
/// <param name="dict1">Reference genotypes; its experiments define the target polarity.</param>
/// <param name="dict2">Genotypes to be harmonized after <paramref name="dict1"/>.</param>
/// <param name="harmonizedDict1">Receives the entries of <paramref name="dict1"/> that passed all filters.</param>
/// <param name="harmonizedDict2">Receives the matching, possibly reverse-complemented, entries of <paramref name="dict2"/>.</param>
/// <exception cref="InvalidOperationException">
/// An experiment that passed all filters has an allele count combination other than
/// 2/2, 1/2 or 2/1.
/// </exception>
public void HarmonizePolarities(GenotypeDictionary dict1, GenotypeDictionary dict2, out GenotypeDictionary harmonizedDict1, out GenotypeDictionary harmonizedDict2)
{
    string tempExp, tempAlleles1, tempAlleles2;
    Dictionary<string, string> allelesDict1, allelesDict2;
    int counter;

    //Collect, per experiment, the distinct alleles observed in each source
    //(the experiment names should be the same in both dictionaries, use dict1 here).
    allelesDict1 = new Dictionary<string, string>(StringComparer.InvariantCultureIgnoreCase);
    allelesDict2 = new Dictionary<string, string>(StringComparer.InvariantCultureIgnoreCase);
    OnStatusChange("Extracting experiments and SNP types");
    counter = 0;
    foreach (string tempKey in dict1.Keys)
    {
        tempExp = dict1.GetExperiment(tempKey);
        //Invariant upper-casing keeps the allele letters independent of the current culture.
        tempAlleles1 = dict1[tempKey].ToUpperInvariant();
        tempAlleles2 = dict2[tempKey].ToUpperInvariant();

        AccumulateGenotypeAlleles(allelesDict1, tempExp, tempAlleles1);
        AccumulateGenotypeAlleles(allelesDict2, tempExp, tempAlleles2);

        counter++;
        ReportHarmonizationProgress("Extracting experiments and SNP types", "genotype", counter);
    }

    //Remove data from SNPs with more than two alleles in source 1 from both dictionaries and save.
    MyRemovedExpTooManyAlleles1 = new CIStringCollection();
    OnStatusChange("Extracting harmonizable data step 1/6");
    counter = 0;
    foreach (KeyValuePair<string, string> entry in allelesDict1)
    {
        if (entry.Value.Length > 2)
        {
            dict1.RemoveExperiment(entry.Key);
            dict2.RemoveExperiment(entry.Key);
            MyRemovedExpTooManyAlleles1.Add(entry.Key);
        }
        counter++;
        ReportHarmonizationProgress("Extracting harmonizable data step 1/6", "experiment", counter);
    }

    //Remove data from SNPs with more than two alleles in source 2 from both dictionaries and save.
    MyRemovedExpTooManyAlleles2 = new CIStringCollection();
    OnStatusChange("Extracting harmonizable data step 2/6");
    counter = 0;
    foreach (KeyValuePair<string, string> entry in allelesDict2)
    {
        if (entry.Value.Length > 2)
        {
            dict1.RemoveExperiment(entry.Key);
            dict2.RemoveExperiment(entry.Key);
            MyRemovedExpTooManyAlleles2.Add(entry.Key);
        }
        counter++;
        ReportHarmonizationProgress("Extracting harmonizable data step 2/6", "experiment", counter);
    }

    //Remove data from SNPs which are monomorphic in both sources and save.
    MyRemovedExpMonomorphicInBoth = new CIStringCollection();
    OnStatusChange("Extracting harmonizable data step 3/6");
    counter = 0;
    foreach (KeyValuePair<string, string> entry in allelesDict1)
    {
        if (entry.Value.Length == 1 && allelesDict2[entry.Key].Length == 1)
        {
            dict1.RemoveExperiment(entry.Key);
            dict2.RemoveExperiment(entry.Key);
            MyRemovedExpMonomorphicInBoth.Add(entry.Key);
        }
        counter++;
        ReportHarmonizationProgress("Extracting harmonizable data step 3/6", "experiment", counter);
    }

    //Remove data from SNPs with incompatible SNP types from both dictionaries and save.
    //Experiments already removed by an earlier step are skipped.
    MyRemovedExpIncompatibleSNPTypes = new CIStringCollection();
    OnStatusChange("Extracting harmonizable data step 4/6");
    counter = 0;
    foreach (KeyValuePair<string, string> entry in allelesDict1)
    {
        tempExp = entry.Key;
        if (!MyRemovedExpTooManyAlleles1.Contains(tempExp)
            && !MyRemovedExpTooManyAlleles2.Contains(tempExp)
            && !MyRemovedExpMonomorphicInBoth.Contains(tempExp))
        {
            if (!this.IsCompatibleSNPTypes(entry.Value, allelesDict2[tempExp]))
            {
                dict1.RemoveExperiment(tempExp);
                dict2.RemoveExperiment(tempExp);
                MyRemovedExpIncompatibleSNPTypes.Add(tempExp);
            }
        }
        counter++;
        ReportHarmonizationProgress("Extracting harmonizable data step 4/6", "experiment", counter);
    }

    //Remove data from SNPs with A/T or G/C SNPs from both dictionaries and save.
    MyRemovedExpATCG = new CIStringCollection();
    OnStatusChange("Extracting harmonizable data step 5/6");
    counter = 0;
    foreach (KeyValuePair<string, string> entry in allelesDict1)
    {
        tempExp = entry.Key;
        if (!MyRemovedExpTooManyAlleles1.Contains(tempExp)
            && !MyRemovedExpTooManyAlleles2.Contains(tempExp)
            && !MyRemovedExpMonomorphicInBoth.Contains(tempExp)
            && !MyRemovedExpIncompatibleSNPTypes.Contains(tempExp))
        {
            //Use the alleles of source 1 when it shows more than one allele,
            //otherwise fall back to the alleles of source 2.
            string informativeAlleles = entry.Value.Length > 1 ? entry.Value : allelesDict2[tempExp];
            if (this.GetNumberOfATalleles(informativeAlleles) != 1)
            {
                //Either 0 or 2 A or T alleles.
                dict1.RemoveExperiment(tempExp);
                dict2.RemoveExperiment(tempExp);
                MyRemovedExpATCG.Add(tempExp);
            }
        }
        counter++;
        ReportHarmonizationProgress("Extracting harmonizable data step 5/6", "experiment", counter);
    }

    //Store only data from experiments passing all tests.
    harmonizedDict1 = new GenotypeDictionary();
    harmonizedDict2 = new GenotypeDictionary();
    OnStatusChange("Extracting harmonizable data step 6/6");
    counter = 0;
    foreach (string tempKey in dict1.Keys)
    {
        tempExp = dict1.GetExperiment(tempKey);
        if (!MyRemovedExpTooManyAlleles1.Contains(tempExp)
            && !MyRemovedExpTooManyAlleles2.Contains(tempExp)
            && !MyRemovedExpMonomorphicInBoth.Contains(tempExp)
            && !MyRemovedExpIncompatibleSNPTypes.Contains(tempExp)
            && !MyRemovedExpATCG.Contains(tempExp))
        {
            harmonizedDict1.Add(tempKey, dict1[tempKey]);
            harmonizedDict2.Add(tempKey, dict2[tempKey]);
        }
        counter++;
        ReportHarmonizationProgress("Extracting harmonizable data step 6/6", "genotype", counter);
    }

    //Go through the harmonized data of source 1. If the alleles of the experiment
    //differ from those seen in source 2, convert the genotype in source 2.
    OnStatusChange("Harmonizing");
    counter = 0;
    foreach (string tempKey in harmonizedDict1.Keys)
    {
        tempExp = harmonizedDict1.GetExperiment(tempKey);
        if (RequiresStrandFlip(allelesDict1[tempExp], allelesDict2[tempExp]))
        {
            harmonizedDict2[tempKey] = this.ReverseComplement(harmonizedDict2[tempKey]);
        }
        counter++;
        ReportHarmonizationProgress("Harmonizing", "genotype", counter);
    }
}

/// <summary>
/// Appends the two alleles of <paramref name="genotype"/> to the distinct-allele string
/// stored for <paramref name="experiment"/>, creating the entry on first use.
/// </summary>
/// <remarks>
/// The alleles are read from character positions 0 and 2 of the genotype
/// (presumably an "A/G"-like format - TODO confirm); a shorter string makes
/// <c>Substring</c> throw.
/// </remarks>
private static void AccumulateGenotypeAlleles(Dictionary<string, string> allelesByExperiment, string experiment, string genotype)
{
    string knownAlleles;
    if (!allelesByExperiment.TryGetValue(experiment, out knownAlleles))
    {
        knownAlleles = string.Empty;
    }

    string firstAllele = genotype.Substring(0, 1);
    string secondAllele = genotype.Substring(2, 1);

    if (!knownAlleles.Contains(firstAllele))
    {
        knownAlleles = knownAlleles + firstAllele;
    }
    if (!knownAlleles.Contains(secondAllele))
    {
        knownAlleles = knownAlleles + secondAllele;
    }

    allelesByExperiment[experiment] = knownAlleles;
}

/// <summary>
/// Decides whether the genotypes of source 2 must be reverse-complemented, given the
/// distinct alleles seen for one experiment in source 1 and in source 2.
/// </summary>
private static bool RequiresStrandFlip(string alleles1, string alleles2)
{
    if (alleles1.Length == 2 && alleles2.Length == 2)
    {
        //Both sources are polymorphic: no flip when they show the same pair in either order.
        bool sameOrder = alleles1[0] == alleles2[0] && alleles1[1] == alleles2[1];
        bool swappedOrder = alleles1[0] == alleles2[1] && alleles1[1] == alleles2[0];
        return !(sameOrder || swappedOrder);
    }
    if (alleles1.Length == 1 && alleles2.Length == 2)
    {
        //Source 1 is monomorphic: flip when its allele is not one of the source 2 alleles.
        return alleles2.IndexOf(alleles1[0]) < 0;
    }
    if (alleles1.Length == 2 && alleles2.Length == 1)
    {
        //Source 2 is monomorphic: flip when its allele is not one of the source 1 alleles.
        return alleles1.IndexOf(alleles2[0]) < 0;
    }

    throw new InvalidOperationException("Inappropriate number of alleles detected when converting SNP types");
}

/// <summary>
/// Emits a status message for every 100th processed unit of the given stage.
/// </summary>
private void ReportHarmonizationProgress(string stage, string unit, int processed)
{
    if (processed % 100 == 0)
    {
        OnStatusChange(stage + " (processed " + unit + " " + processed + ")");
    }
}