/// <summary>
/// Compares the formula, CAS number and integer mass of a data row against the values
/// reported by PubChem and, when available, KEGG
/// </summary>
/// <remarks>
/// Masses are compared after truncation to an integer.
/// When <paramref name="pubChem"/> is null, the PubChem formula is treated as an empty string and the PubChem mass as 0.
/// A missing "formula", "cas" or "mass" key raises <see cref="KeyNotFoundException"/>;
/// a non-numeric mass raises <see cref="FormatException"/>.
/// </remarks>
/// <param name="row">Data row; must contain the keys "formula", "cas" and "mass"</param>
/// <param name="pubChem">PubChem compound for the row, or null</param>
/// <param name="kegg">KEGG compound for the row, or null to compare against PubChem only</param>
/// <returns>
/// True if the row agrees with PubChem (formula and mass) and, when <paramref name="kegg"/> is not null,
/// also with KEGG (formula, CAS number and exact mass)
/// </returns>
public bool CheckRow(Dictionary<string, string> row, Compound pubChem, CompoundData kegg)
{
    var rowFormula = row["formula"];
    var rowCas = row["cas"];

    // The row comes from a machine-readable TSV file, so the mass must not be parsed with the
    // user's regional settings (a comma-decimal culture would misread "180.06")
    var rowMass = (int)double.Parse(row["mass"], System.Globalization.CultureInfo.InvariantCulture);

    var pubFormula = "";
    var pubMass = 0.0;

    if (pubChem != null)
    {
        pubFormula = pubChem.findProp("Molecular Formula").sval;
        pubMass = pubChem.findProp("MonoIsotopic").fval;
    }

    var matchesPubChem = rowFormula == pubFormula && rowMass == (int)pubMass;

    if (kegg == null)
    {
        return matchesPubChem;
    }

    var keggFormula = kegg.Formula;
    var keggExactMass = kegg.ExactMass;
    var keggCas = kegg.OtherId("CAS");

    return matchesPubChem &&
           rowFormula == keggFormula &&
           rowCas == keggCas &&
           rowMass == (int)keggExactMass;
}
/// <summary>
/// Formats the KEGG values of a compound as a single line of right-aligned columns
/// </summary>
/// <param name="k">KEGG compound data; may be null</param>
/// <returns>
/// A line holding the "KEGG" label, the KEGG ID, the exact mass truncated to an integer,
/// the formula and the CAS number; "No Kegg" followed by a newline when <paramref name="k"/> is null
/// </returns>
private string printKegg(CompoundData k)
{
    if (k == null)
    {
        return "No Kegg\n";
    }

    var keggId = k.KeggId;
    var integerMass = (int)k.ExactMass;
    var formula = k.Formula;
    var casNumber = k.OtherId("CAS");

    return $"{"KEGG",10}{keggId,10}{integerMass,20}{formula,20}{casNumber,20}\n";
}
/// <summary>
/// Writes one validation report entry: heading, data row, KEGG values, PubChem values and a trailing newline
/// </summary>
/// <param name="file">Writer that receives the report entry</param>
/// <param name="row">Data row being reported</param>
/// <param name="pubChem">PubChem compound for the row; passed on to printPubChem</param>
/// <param name="kegg">KEGG compound for the row; may be null</param>
/// <param name="rowIndex">Row number passed on to printHead</param>
private void WriteContentToFile(
    TextWriter file,
    IReadOnlyDictionary<string, string> row,
    Compound pubChem,
    CompoundData kegg,
    int rowIndex)
{
    // Each section is formatted and written before the next one is formatted
    var heading = printHead(rowIndex);
    file.Write(heading);

    var rowSection = printRow(row);
    file.Write(rowSection);

    var keggSection = printKegg(kegg);
    file.Write(keggSection);

    var pubChemSection = printPubChem(pubChem);
    file.Write(pubChemSection);

    file.Write("\n");
}
/// <summary>
/// Main processing function: validates the records in the local input file and
/// appends the new ones to the master TSV file stored on GitHub
/// </summary>
/// <remarks>
/// Unless <c>options.IgnoreErrors</c> is set, every input row is checked against PubChem and KEGG:
/// <list type="bullet">
/// <item>Rows whose CAS number is already in the master file are removed and written to DUPLICATE_ROWS_FILE</item>
/// <item>Rows that fail <see cref="CheckRow"/> are removed, written to WARNING_ROWS_FILE and detailed in ValidationApi.txt</item>
/// <item>Rows that pass against PubChem but have no KEGG compound are listed in MISSING_KEGG_FILE and are still uploaded</item>
/// </list>
/// The remaining rows are then checked with GoodTables (failures are reported only), appended to the
/// master data, and the result is sent to GitHub.
/// </remarks>
/// <param name="options">Processing options</param>
/// <returns>True on success, false if an error</returns>
private bool ProcessMetabolites(MetaboliteValidatorOptions options)
{
    try
    {
        if (string.IsNullOrWhiteSpace(options.InputFile))
        {
            Console.WriteLine();
            Console.WriteLine("Error, input file not defined");
            return false;
        }

        var inputFile = new FileInfo(options.InputFile);
        if (!inputFile.Exists)
        {
            Console.WriteLine();
            Console.WriteLine("Error, input file not found: " + inputFile.FullName);
            return false;
        }

        // Init GitHub API interaction with the repo and owner
        var github = new Github("MetabolomicsCCS", "PNNL-Comp-Mass-Spec", options.Preview);

        if (!string.IsNullOrEmpty(options.Username))
        {
            github.Username = options.Username;

            if (!string.IsNullOrEmpty(options.Password))
            {
                // A leading asterisk marks an encoded password
                github.Password = options.Password.StartsWith("*", StringComparison.Ordinal)
                    ? MetaboliteValidatorOptions.DecodePassword(options.Password.Substring(1))
                    : options.Password;
            }
        }

        // Get the main data file from GitHub
        var dataFile = github.GetFile("data/" + MASTER_TSV_FILE);

        // Parse the new data to append to the current data
        var fileToAppend = new DelimitedFileParser();
        fileToAppend.ParseFile(inputFile.FullName, '\t');

        Console.WriteLine();
        Console.WriteLine("Found {0} records in local file {1}", fileToAppend.Count(), inputFile.Name);

        // Update column names if necessary
        UpdateHeaders(fileToAppend);

        // Parse the main data file from GitHub
        var mainFile = new DelimitedFileParser();
        if (dataFile == null)
        {
            // No master file was retrieved; start an empty one that uses the headers of the input file
            mainFile.SetDelimiter('\t');
            mainFile.SetHeaders(fileToAppend.GetHeaders());
        }
        else
        {
            mainFile.ParseString(dataFile, '\t');

            Console.WriteLine();
            Console.WriteLine("Found {0} records in file {1} retrieved from GitHub", mainFile.Count(), MASTER_TSV_FILE);
            Console.WriteLine();
        }

        // Update column names if necessary
        UpdateHeaders(mainFile);

        var duplicateRowCount = 0;

        if (!options.IgnoreErrors)
        {
            // Get IDs for KEGG and PubChem
            var keggIds = fileToAppend.GetColumnAt("kegg").Where(x => !string.IsNullOrEmpty(x)).ToArray();
            var cidIds = fileToAppend.GetColumnAt("pubchem cid").Where(x => !string.IsNullOrEmpty(x)).ToArray();

            // A set gives constant-time duplicate checks inside the row loop below
            var mainCasIds = new HashSet<string>(
                mainFile.GetColumnAt("cas").Where(x => !string.IsNullOrEmpty(x)));

            // Generate PubChem and KEGG utils
            var pub = new PubchemUtil(cidIds);
            var kegg = new KeggUtil(keggIds);

            var dupRows = CreateEmptyRowCollection(fileToAppend);
            var warningRows = CreateEmptyRowCollection(fileToAppend);
            var missingKegg = CreateEmptyRowCollection(fileToAppend);

            var dataMap = fileToAppend.GetMap();

            // The using block guarantees the report is flushed and closed even if validation throws
            using (var validationReport = new StreamWriter("ValidationApi.txt"))
            {
                // Compare fileToAppend to the utils.
                // Rows are visited last-to-first, presumably so that removing a row from fileToAppend
                // does not shift the indices still to be visited (confirm against GetMap())
                for (var i = dataMap.Count - 1; i >= 0; i--)
                {
                    var row = dataMap[i];

                    Compound pubChemCompound = null;
                    CompoundData keggCompound = null;

                    if (!string.IsNullOrEmpty(row["pubchem cid"]))
                    {
                        // NOTE(review): unlike the KEGG lookup below, this indexer is not guarded
                        // by a key check; confirm that PubChemMap always holds every requested CID
                        pubChemCompound = pub.PubChemMap[int.Parse(row["pubchem cid"])];
                    }

                    if (!string.IsNullOrEmpty(row["kegg"]) && kegg.CompoundsMap.ContainsKey(row["kegg"]))
                    {
                        keggCompound = kegg.CompoundsMap[row["kegg"]];
                    }

                    if (mainCasIds.Contains(row["cas"]))
                    {
                        // Already present in the master file
                        dupRows.Add(row);
                        fileToAppend.Remove(row);
                        continue;
                    }

                    var rowIsValid = CheckRow(row, pubChemCompound, keggCompound);

                    if (!rowIsValid)
                    {
                        // Remove from the list and add to the warning file
                        // (+2 turns the zero-based count into the row number reported by printHead)
                        WriteContentToFile(validationReport, row, pubChemCompound, keggCompound, warningRows.Count() + 2);
                        warningRows.Add(row);
                        fileToAppend.Remove(row);
                    }
                    else if (keggCompound == null)
                    {
                        // Valid against PubChem, but no KEGG compound; the row is kept for upload
                        missingKegg.Add(row);
                    }
                }
            }

            duplicateRowCount = dupRows.Count();

            if (fileToAppend.Count() > 0)
            {
                Console.WriteLine("Validating data file with GoodTables");
                var goodTables = new GoodTables(fileToAppend.ToString(true), SchemaUrl);

                if (!goodTables.Response.success)
                {
                    // GoodTables failures are only reported; the rows stay in fileToAppend
                    using (var goodTablesReport = new StreamWriter(GOOD_TABLES_WARNING_FILE))
                    {
                        goodTables.OutputResponse(goodTablesReport);
                    }

                    Console.WriteLine();
                    Console.WriteLine("GoodTables reports errors; see " + GOOD_TABLES_WARNING_FILE);
                    Console.WriteLine("Note that data with N/A in columns that expect a number will be flagged as an error by GoodTables; those errors can be ignored");
                }
            }

            streamToFile(DUPLICATE_ROWS_FILE, dupRows);
            streamToFile(WARNING_ROWS_FILE, warningRows);
            streamToFile(MISSING_KEGG_FILE, missingKegg);

            if (warningRows.Count() > 0)
            {
                Console.WriteLine();
                Console.WriteLine("Warnings were encountered; see file " + WARNING_ROWS_FILE);
            }

            if (missingKegg.Count() > 0)
            {
                Console.WriteLine();
                Console.WriteLine("Warnings were encountered; see file " + MISSING_KEGG_FILE);
            }
        }
        else
        {
            Console.WriteLine();
            Console.WriteLine("Ignoring validation, skipping to file upload.");
        }

        if (fileToAppend.Count() == 0)
        {
            Console.WriteLine();
            Console.WriteLine("No new compounds were found; see {0} for the {1} skipped compounds", DUPLICATE_ROWS_FILE, duplicateRowCount);
            return true;
        }

        // Add the new data to the existing data downloaded from GitHub
        var success = mainFile.Concat(fileToAppend);
        if (!success)
        {
            // Concatenation of new records failed; do not upload
            return false;
        }

        // NOTE(review): the results of the SendFileAsync calls are not observed here;
        // confirm that each call completes (or reports failure) before the program exits

        // Send the completed tsv back to GitHub
        github.SendFileAsync(mainFile.ToString(true), "data/" + MASTER_TSV_FILE);

        // Send the Agilent file to GitHub
        github.SendFileAsync(mainFile.PrintAgilent(), "data/metabolitedataAgilent.tsv");

        return true;
    }
    catch (Exception ex)
    {
        Console.WriteLine();
        Console.WriteLine("Error processing data: " + ex.Message);
        Console.WriteLine(StackTraceFormatter.GetExceptionStackTraceMultiLine(ex));
        return false;
    }
}

/// <summary>
/// Creates an empty tab-delimited row collection that uses the headers of the given parser
/// </summary>
/// <param name="template">Parser whose headers are copied</param>
/// <returns>New parser with headers and delimiter set, holding no rows</returns>
private static DelimitedFileParser CreateEmptyRowCollection(DelimitedFileParser template)
{
    var rows = new DelimitedFileParser();
    rows.SetHeaders(template.GetHeaders());
    rows.SetDelimiter('\t');
    return rows;
}
/// <summary>
/// Parses the text of a KEGG compound page into a <see cref="CompoundData"/> instance
/// </summary>
/// <remarks>
/// <para>
/// The sections ENTRY, NAME, FORMULA, EXACT_MASS, MOL_WEIGHT, COMMENT, PATHWAY and DBLINKS are read
/// (keywords are matched case-insensitively); all other lines are ignored, as is everything before the
/// first ENTRY line. If the page holds several ENTRY lines, the data of the last one is returned.
/// </para>
/// <para>
/// NAME, PATHWAY and DBLINKS may continue on following lines that start with whitespace.
/// Only the first line of a COMMENT section is stored.
/// Truncated pages, blank lines and lines with missing values are skipped instead of raising
/// an index-out-of-range error. A malformed number still raises <see cref="FormatException"/>.
/// </para>
/// </remarks>
/// <param name="page">Page text with lines separated by LF or CRLF</param>
/// <returns>The parsed compound, or null if <paramref name="page"/> is null or has no ENTRY line</returns>
public CompoundData ReadKeggCompoundStream(string page)
{
    if (page == null)
    {
        return null;
    }

    var spaceSeparator = new[] { " " };
    var linkSeparator = new[] { ": " };
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var lines = page.Split('\n');
    CompoundData entryData = null;

    for (var i = 0; i < lines.Length; i++)
    {
        // Drop the CR left behind when the page uses CRLF line endings
        var line = lines[i].TrimEnd('\r');
        string[] tokens;

        if (StartsWithKeyword(line, "entry"))
        {
            tokens = line.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 2)
            {
                // ENTRY line without an ID; nothing to create
                continue;
            }

            entryData = new CompoundData(tokens[1]);
            if (tokens.Length > 2)
            {
                entryData.Type = tokens[2];
            }

            continue;
        }

        if (entryData == null)
        {
            continue;
        }

        if (StartsWithKeyword(line, "name"))
        {
            // Names may contain spaces, so each line is stored whole rather than tokenized
            foreach (var name in ReadSectionLines(lines, ref i, "name".Length))
            {
                entryData.Names.Add(name);
            }
        }
        else if (StartsWithKeyword(line, "formula"))
        {
            tokens = line.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length > 1)
            {
                entryData.Formula = tokens[1];
            }
        }
        else if (StartsWithKeyword(line, "exact_mass"))
        {
            tokens = line.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length > 1)
            {
                // The page is machine-readable text; do not parse with regional settings
                entryData.ExactMass = double.Parse(tokens[1], invariant);
            }
        }
        else if (StartsWithKeyword(line, "mol_weight"))
        {
            tokens = line.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length > 1)
            {
                entryData.MolecularWeight = double.Parse(tokens[1], invariant);
            }
        }
        else if (StartsWithKeyword(line, "comment"))
        {
            entryData.Comment = line.Substring("comment".Length).Trim();
        }
        else if (StartsWithKeyword(line, "pathway"))
        {
            foreach (var pathwayLine in ReadSectionLines(lines, ref i, "pathway".Length))
            {
                // The first token is the pathway ID; the rest of the line is not stored
                tokens = pathwayLine.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);
                entryData.Pathways.Add(tokens[0]);
            }
        }
        else if (StartsWithKeyword(line, "dblinks"))
        {
            foreach (var linkLine in ReadSectionLines(lines, ref i, "dblinks".Length))
            {
                // Expected form: "<database>: <id> [<id> ...]"
                tokens = linkLine.Split(linkSeparator, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length < 2)
                {
                    continue;
                }

                var identifiers = tokens[1].Trim().Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries);
                foreach (var identifier in identifiers)
                {
                    entryData.OtherIds.Add(new KeyValuePair<string, string>(tokens[0], identifier));
                }
            }
        }
    }

    return entryData;
}

/// <summary>
/// Returns true if the line starts with the given keyword, ignoring case and culture
/// </summary>
private static bool StartsWithKeyword(string line, string keyword)
{
    return line.StartsWith(keyword, StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Returns true if the line continues the previous section: it starts with whitespace and is not blank
/// </summary>
private static bool IsContinuationLine(string line)
{
    return !string.IsNullOrWhiteSpace(line) && char.IsWhiteSpace(line[0]);
}

/// <summary>
/// Collects the trimmed, non-blank content of a section: the text after the keyword on the line at
/// <paramref name="index"/> plus all continuation lines that follow it
/// </summary>
/// <param name="lines">All lines of the page</param>
/// <param name="index">Index of the keyword line; on return, the index of the last line consumed</param>
/// <param name="keywordLength">Number of leading characters to skip on the keyword line</param>
private static List<string> ReadSectionLines(string[] lines, ref int index, int keywordLength)
{
    var section = new List<string>();

    var firstValue = lines[index].Substring(keywordLength).Trim();
    if (firstValue.Length > 0)
    {
        section.Add(firstValue);
    }

    // Peek before advancing so the line that ends the section is left for the caller's loop
    while (index + 1 < lines.Length && IsContinuationLine(lines[index + 1]))
    {
        index++;
        section.Add(lines[index].Trim());
    }

    return section;
}