// ----------------- private methods -----------------

// Makes sure that, for a single molecule, the way the hashes are created and folded is
// consistent with a reference.
//
// molstr:    SD file text; only the first record read from it is used
// classType: fingerprint class passed to the Bayesian model (logged as "ECFP6" when it is ECFP6,
//            otherwise as "FCFP6")
// folding:   folding size passed to the Bayesian model
// refHash:   expected hash codes, compared element by element with the calculated ones
//
// Throws CDKException("Hashes differ.") when the two arrays differ in length or content,
// after writing both arrays out for inspection.
private static void CheckFP(string molstr, CircularFingerprinterClass classType, int folding, int[] refHash)
{
    var strType = classType == CircularFingerprinterClass.ECFP6 ? "ECFP6" : "FCFP6";
    WriteLine($"Comparing hash codes for {strType}/folding={folding}");

    var model = new Bayesian(classType, folding);

    // The reader is disposable (RunTest wraps it in a using block as well), so release it as
    // soon as the first record has been handed to the model.
    using (var rdr = new EnumerableSDFReader(new StringReader(molstr), ChemObjectBuilder.Instance))
    {
        model.AddMolecule(rdr.First(), false);
    }

    var calcHash = model.Training[0];

    // SequenceEqual reports false for a length mismatch as well as for any differing element.
    if (!calcHash.SequenceEqual(refHash))
    {
        WriteLine($" ** calculated: {ArrayStr(calcHash)}");
        WriteLine($" ** reference: {ArrayStr(refHash)}");
        throw new CDKException("Hashes differ.");
    }
}
// Makes sure auxiliary fields (title, origin & comments) can serialise/deserialise.
//
// A model is given dummy note fields, serialised and deserialised again; the fields are then
// checked on both the original model and the round-tripped copy.
//
// Throws CDKException when reserialisation fails with an IOException, or when the title, the
// origin or the comments of either model differ from the values that were assigned. Each kind
// of field has its own message so a failure can be attributed to the right field.
private void CheckTextFields()
{
    WriteLine("Checking integrity of text fields");

    var dummyTitle = "some title";
    var dummyOrigin = "some origin";
    var dummyComments = new string[] { "comment1", "comment2" };

    var model1 = new Bayesian(CircularFingerprinterClass.ECFP6)
    {
        NoteTitle = dummyTitle,
        NoteOrigin = dummyOrigin,
        NoteComments = dummyComments
    };

    Bayesian model2;
    try
    {
        model2 = Bayesian.Deserialise(model1.Serialise());
    }
    catch (IOException ex)
    {
        throw new CDKException("Reserialisation failed", ex);
    }

    if (!dummyTitle.Equals(model1.NoteTitle, StringComparison.Ordinal)
        || !dummyTitle.Equals(model2.NoteTitle, StringComparison.Ordinal))
    {
        throw new CDKException("Note integrity failure for title");
    }

    if (!dummyOrigin.Equals(model1.NoteOrigin, StringComparison.Ordinal)
        || !dummyOrigin.Equals(model2.NoteOrigin, StringComparison.Ordinal))
    {
        throw new CDKException("Note integrity failure for origin");
    }

    var comments1 = model1.NoteComments;
    var comments2 = model2.NoteComments;
    foreach (var comments in new[] { comments1, comments2 })
    {
        // Check the count first so the indexed comparison below cannot run off the end.
        if (comments.Count != dummyComments.Length)
        {
            throw new CDKException("Note integrity failure for comments");
        }

        for (int n = 0; n < dummyComments.Length; n++)
        {
            if (!comments[n].Equals(dummyComments[n], StringComparison.Ordinal))
            {
                throw new CDKException("Note integrity failure for comments");
            }
        }
    }
}
// Walks every molecule of an SD file resource and checks that its folded fingerprint, rendered
// as a string, is literally identical to the reference string stored in the given field.
//
// sdfile:    resource name, looked up under NCDK.Data.CDD
// fpField:   property of each record that holds the reference fingerprint string
// classType: fingerprint class passed to the Bayesian model
// folding:   folding size passed to the Bayesian model
private static void CompareFolding(string sdfile, string fpField, CircularFingerprinterClass classType, int folding)
{
    WriteLine("[" + sdfile + "] calculation of: " + fpField);

    using (var stream = ResourceLoader.GetAsStream("NCDK.Data.CDD." + sdfile))
    {
        int recordNumber = 0;
        foreach (var record in new EnumerableSDFReader(stream, ChemObjectBuilder.Instance))
        {
            recordNumber++;

            // A fresh model per record, so Training[0] always belongs to this record.
            var singleModel = new Bayesian(classType, folding);
            singleModel.AddMolecule(record, false);

            var actual = ArrayStr(singleModel.Training[0]);
            var expected = (string)record.GetProperties()[fpField];

            Assert.AreEqual(expected, actual, $"Folded hashes do not match reference at {recordNumber}.");
        }
    }
}
// Builds a Bayesian model from an SD file resource, cross-validates it, and compares the result
// with a serialised reference model resource.
//
// sdfile:         resource name (under NCDK.Data.CDD) of the SD file with the training set
// actvField:      property holding each record's activity: "true"/"false", or an integer 1/0
// classType:      fingerprint class passed to the Bayesian model
// folding:        folding size passed to the Bayesian model
// xval:           3 selects three-fold and 5 selects five-fold validation; any other value
//                 selects leave-one-out
// modelFN:        resource name (under NCDK.Data.CDD) of the serialised reference model
// perceiveStereo: assigned to the model's PerceiveStereo property
//
// Throws CDKException when an activity value is null, unparsable or neither 0 nor 1, when the
// model disagrees with the reference, or (wrapping the original exception) on any other failure.
private void RunTest(string sdfile, string actvField, CircularFingerprinterClass classType, int folding, int xval, string modelFN, bool perceiveStereo)
{
    WriteLine("[" + modelFN + "]");
    WriteLine(" Loading " + sdfile);

    try
    {
        var model = new Bayesian(classType, folding) { PerceiveStereo = perceiveStereo };

        int row = 0, numActives = 0;
        using (var rdr = new EnumerableSDFReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"), ChemObjectBuilder.Instance))
        {
            foreach (var mol in rdr)
            {
                row++;

                var stractv = (string)mol.GetProperties()[actvField];

                // Null-safe comparison and TryParse, so that a null or non-numeric value is
                // reported with the dedicated message below instead of surfacing as a wrapped
                // NullReferenceException or FormatException.
                int active;
                if (string.Equals(stractv, "true", StringComparison.Ordinal))
                {
                    active = 1;
                }
                else if (string.Equals(stractv, "false", StringComparison.Ordinal))
                {
                    active = 0;
                }
                else if (!int.TryParse(stractv, NumberStyles.Integer, NumberFormatInfo.InvariantInfo, out active))
                {
                    active = -1; // marks the value as invalid
                }

                if (active != 0 && active != 1)
                {
                    throw new CDKException("Activity field not found or invalid");
                }

                model.AddMolecule(mol, active == 1);
                numActives += active;
            }
        }

        WriteLine($" Training with {row} rows, {numActives} actives, {(row - numActives)} inactives");

        model.Build();
        if (xval == 3)
        {
            model.ValidateThreeFold();
        }
        else if (xval == 5)
        {
            model.ValidateFiveFold();
        }
        else
        {
            model.ValidateLeaveOneOut();
        }

        WriteLine($" Validation: ROC AUC={model.RocAUC}");

        WriteLine(" Parsing reference model");

        Bayesian reference;
        using (var mrdr = new StreamReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{modelFN}")))
        {
            reference = Bayesian.Deserialise(mrdr);
        }

        // start comparing the details...

        // Largest difference tolerated between calculated and reference thresholds.
        const double thresholdTolerance = 0.00000000000001;

        bool failed = false;
        if (model.Folding != reference.Folding)
        {
            WriteLine($" ** reference folding size={reference.Folding}");
            failed = true;
        }
        if (model.TrainingSize != reference.TrainingSize)
        {
            WriteLine($" ** reference training size={reference.TrainingSize}");
            failed = true;
        }
        if (model.TrainingActives != reference.TrainingActives)
        {
            WriteLine($" ** reference training actives={reference.TrainingActives}");
            failed = true;
        }
        if (model.RocType != reference.RocType)
        {
            WriteLine($" ** reference ROC type={reference.RocType}");
            failed = true;
        }
        if (!DblEqual(model.RocAUC, reference.RocAUC))
        {
            WriteLine($" ** reference ROC AUC={reference.RocAUC}");
            failed = true;
        }
        if (Math.Abs(model.LowThreshold - reference.LowThreshold) > thresholdTolerance)
        {
            WriteLine($" ** reference lowThresh={reference.LowThreshold} different to calculated {model.LowThreshold}");
            failed = true;
        }
        if (Math.Abs(model.HighThreshold - reference.HighThreshold) > thresholdTolerance)
        {
            WriteLine($" ** reference highThresh={reference.HighThreshold} different to calculated {model.HighThreshold}");
            failed = true;
        }

        // make sure individual hash bit contributions match
        var mbits = model.Contributions;
        var rbits = reference.Contributions;
        if (mbits.Count != rbits.Count)
        {
            WriteLine($" ** model has {mbits.Count} contribution bits, reference has {rbits.Count}");
            failed = true;
        }
        foreach (var h in mbits.Keys)
        {
            if (!rbits.ContainsKey(h))
            {
                WriteLine($" ** model hash bit {h} not found in reference");
                failed = true;
                break; // one is enough
            }
        }
        foreach (var h in rbits.Keys)
        {
            if (!mbits.ContainsKey(h))
            {
                WriteLine($" ** reference hash bit {h} not found in model");
                failed = true;
                break; // one is enough
            }
        }
        foreach (var h in mbits.Keys)
        {
            // Bits missing from the reference were already reported above; only compare shared bits.
            if (rbits.ContainsKey(h))
            {
                double c1 = mbits[h], c2 = rbits[h];
                if (!DblEqual(c1, c2))
                {
                    WriteLine($" ** contribution for bit {h}: model={c1}, reference={c2}");
                    failed = true;
                    break; // one is enough
                }
            }
        }

        if (failed)
        {
            throw new CDKException("Comparison to reference failed");
        }
    }
    catch (CDKException)
    {
        // Already the exception type callers expect; rethrow without wrapping.
        throw;
    }
    catch (Exception ex)
    {
        throw new CDKException("Test failed", ex);
    }
}
// Trains an ECFP6 model (folded to 1024) on an SD file resource, then scores every training
// molecule with the scaled predictor and tallies a confusion matrix using 0.5 as the cut-off.
// The tallies are written out and must equal the four required counts.
//
// sdfile:   resource name, looked up under NCDK.Data.CDD; activity is read from the "Active" property
// truePos, trueNeg, falsePos, falseNeg: required confusion matrix counts
//
// Throws CDKException("Confusion matrix mismatch") when any count differs, and wraps any
// non-CDK exception raised while loading as CDKException("Test failed").
private static void ConfirmPredictions(string sdfile, int truePos, int trueNeg, int falsePos, int falseNeg)
{
    WriteLine($"[{sdfile}] comparing confusion matrix");

    var molecules = new List<IAtomContainer>();
    var activities = new List<bool>();
    var model = new Bayesian(CircularFingerprinterClass.ECFP6, 1024);

    try
    {
        using (var stream = ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"))
        {
            foreach (var entry in new EnumerableSDFReader(stream, ChemObjectBuilder.Instance))
            {
                bool isActive = string.Equals("true", (string)entry.GetProperties()["Active"], StringComparison.Ordinal);
                molecules.Add(entry);
                activities.Add(isActive);
                model.AddMolecule(entry, isActive);
            }
        }
    }
    catch (CDKException)
    {
        throw;
    }
    catch (Exception ex)
    {
        throw new CDKException("Test failed", ex);
    }

    model.Build();
    model.ValidateLeaveOneOut();

    // build the confusion matrix
    int gotTP = 0, gotTN = 0, gotFP = 0, gotFN = 0;
    for (int idx = 0; idx < molecules.Count; idx++)
    {
        bool predictedActive = model.ScalePredictor(model.Predict(molecules[idx])) >= 0.5;
        bool reallyActive = activities[idx];

        if (predictedActive && reallyActive)
        {
            gotTP++;
        }
        else if (predictedActive)
        {
            gotFP++;
        }
        else if (reallyActive)
        {
            gotFN++;
        }
        else
        {
            gotTN++;
        }
    }

    WriteLine($" True Positives: got={gotTP} require={truePos}");
    WriteLine($" Negatives: got={gotTN} require={trueNeg}");
    WriteLine($" False Positives: got={gotFP} require={falsePos}");
    WriteLine($" Negatives: got={gotFN} require={falseNeg}");

    bool matches = gotTP == truePos && gotTN == trueNeg && gotFP == falsePos && gotFN == falseNeg;
    if (!matches)
    {
        throw new CDKException("Confusion matrix mismatch");
    }
}