Esempio n. 1
0
        // ----------------- private methods -----------------

        // Makes sure that, for a single molecule, the way the hashes are created & folded is consistent
        // with a reference.
        //   molstr:    text handed to an EnumerableSDFReader; only the first record is used
        //   classType: fingerprint class passed to the Bayesian model
        //   folding:   folding size passed to the Bayesian model
        //   refHash:   expected hash codes, compared position-by-position with the calculated ones
        // Prints both arrays and throws CDKException("Hashes differ.") when they are not identical.
        private static void CheckFP(string molstr, CircularFingerprinterClass classType, int folding, int[] refHash)
        {
            var strType = classType == CircularFingerprinterClass.ECFP6 ? "ECFP6" : "FCFP6";

            WriteLine($"Comparing hash codes for {strType}/folding={folding}");

            Bayesian model;

            // The reader is only needed to pull out the first record, so release it right afterwards.
            using (var rdr = new EnumerableSDFReader(new StringReader(molstr), ChemObjectBuilder.Instance))
            {
                var mol = rdr.First();

                model = new Bayesian(classType, folding);
                model.AddMolecule(mol, false);
            }

            var calcHash = model.Training[0];

            // Length is checked first so that differently sized arrays are reported as a mismatch.
            var same = calcHash.Length == refHash.Length && calcHash.SequenceEqual(refHash);

            if (!same)
            {
                WriteLine($"    ** calculated: {ArrayStr(calcHash)}");
                WriteLine($"    ** reference:  {ArrayStr(refHash)}");
                throw new CDKException("Hashes differ.");
            }
        }
Esempio n. 2
0
        // Makes sure auxiliary fields (title, origin & comments) survive a serialise/deserialise round trip.
        // A model is created with known note values, serialised and deserialised again; the notes of both
        // the original and the round-tripped model are then compared against the known values.
        // Throws CDKException when deserialisation raises an IOException or when any note differs; the
        // exception message names the field group (title, origin or comments) that failed.
        private void CheckTextFields()
        {
            WriteLine("Checking integrity of text fields");

            var dummyTitle    = "some title";
            var dummyOrigin   = "some origin";
            var dummyComments = new string[] { "comment1", "comment2" };

            var model1 = new Bayesian(CircularFingerprinterClass.ECFP6)
            {
                NoteTitle    = dummyTitle,
                NoteOrigin   = dummyOrigin,
                NoteComments = dummyComments
            };

            Bayesian model2;

            try
            {
                model2 = Bayesian.Deserialise(model1.Serialise());
            }
            catch (IOException ex)
            {
                throw new CDKException("Reserialisation failed", ex);
            }

            // string.Equals(a, b, ...) is used throughout so that a null note is reported as a mismatch
            // rather than surfacing as a NullReferenceException.
            if (!string.Equals(dummyTitle, model1.NoteTitle, StringComparison.Ordinal) ||
                !string.Equals(dummyTitle, model2.NoteTitle, StringComparison.Ordinal))
            {
                throw new CDKException("Note integrity failure for title");
            }

            if (!string.Equals(dummyOrigin, model1.NoteOrigin, StringComparison.Ordinal) ||
                !string.Equals(dummyOrigin, model2.NoteOrigin, StringComparison.Ordinal))
            {
                throw new CDKException("Note integrity failure for origin");
            }

            // Compare every comment of both models, not just a fixed pair of indices.
            foreach (var comments in new[] { model1.NoteComments, model2.NoteComments })
            {
                if (comments == null || comments.Count != dummyComments.Length)
                {
                    throw new CDKException("Note integrity failure for comments");
                }

                for (int n = 0; n < dummyComments.Length; n++)
                {
                    if (!string.Equals(comments[n], dummyComments[n], StringComparison.Ordinal))
                    {
                        throw new CDKException("Note integrity failure for comments");
                    }
                }
            }
        }
Esempio n. 3
0
        // Walks through every molecule of an SD file resource and asserts that the folded fingerprint
        // hashes calculated for it are literally identical to the reference stored in one of its fields.
        //   sdfile:    resource name, resolved under "NCDK.Data.CDD."
        //   fpField:   name of the molecule property holding the reference hash string
        //   classType: fingerprint class passed to the Bayesian model
        //   folding:   folding size passed to the Bayesian model
        private static void CompareFolding(string sdfile, string fpField, CircularFingerprinterClass classType, int folding)
        {
            WriteLine($"[{sdfile}] calculation of: {fpField}");

            using (var stream = ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"))
            {
                // 1-based record counter, only used for the assertion message
                var recordNo = 0;

                foreach (var entry in new EnumerableSDFReader(stream, ChemObjectBuilder.Instance))
                {
                    ++recordNo;

                    // a fresh model per molecule, so the molecule's hashes are always the first training entry
                    var bayes = new Bayesian(classType, folding);
                    bayes.AddMolecule(entry, false);

                    var actual   = ArrayStr(bayes.Training[0]);
                    var expected = (string)entry.GetProperties()[fpField];

                    Assert.AreEqual(expected, actual, $"Folded hashes do not match reference at {recordNo}.");
                }
            }
        }
Esempio n. 4
0
        // Builds a Bayesian model from an SD file resource, validates it and compares the outcome against
        // a previously serialised reference model.
        //   sdfile:         SD file resource name, resolved under "NCDK.Data.CDD."
        //   actvField:      molecule property holding the activity: "true", "false", "1" or "0"
        //   classType:      fingerprint class passed to the Bayesian model
        //   folding:        folding size passed to the Bayesian model
        //   xval:           3 selects three-fold validation, 5 five-fold, anything else leave-one-out
        //   modelFN:        reference model resource name, resolved under "NCDK.Data.CDD."
        //   perceiveStereo: value assigned to the model's PerceiveStereo property
        // Throws CDKException when an activity value is invalid or when the comparison to the reference
        // fails; any other exception is wrapped in a CDKException("Test failed").
        private void RunTest(string sdfile, string actvField, CircularFingerprinterClass classType, int folding, int xval, string modelFN, bool perceiveStereo)
        {
            WriteLine("[" + modelFN + "]");
            WriteLine("    Loading " + sdfile);

            try
            {
                var model = new Bayesian(classType, folding)
                {
                    PerceiveStereo = perceiveStereo
                };

                int row = 0, numActives = 0;
                using (var rdr = new EnumerableSDFReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"), ChemObjectBuilder.Instance))
                {
                    foreach (var mol in rdr)
                    {
                        row++;

                        var stractv = (string)mol.GetProperties()[actvField];

                        // Null-safe comparisons and TryParse make sure that a null or non-numeric value is
                        // reported with the dedicated message below instead of a generic "Test failed".
                        int active;
                        if (string.Equals(stractv, "true", StringComparison.Ordinal))
                        {
                            active = 1;
                        }
                        else if (string.Equals(stractv, "false", StringComparison.Ordinal))
                        {
                            active = 0;
                        }
                        else if (!int.TryParse(stractv, NumberStyles.Integer, NumberFormatInfo.InvariantInfo, out active) ||
                                 (active != 0 && active != 1))
                        {
                            throw new CDKException("Activity field not found or invalid");
                        }

                        model.AddMolecule(mol, active == 1);
                        numActives += active;
                    }
                }

                WriteLine($"    Training with {row} rows, {numActives} actives, {(row - numActives)} inactives");

                model.Build();
                if (xval == 3)
                {
                    model.ValidateThreeFold();
                }
                else if (xval == 5)
                {
                    model.ValidateFiveFold();
                }
                else
                {
                    model.ValidateLeaveOneOut();
                }

                WriteLine($"    Validation: ROC AUC={model.RocAUC}");
                WriteLine($"    Parsing reference model");

                Bayesian reference;
                using (var mrdr = new StreamReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{modelFN}")))
                {
                    reference = Bayesian.Deserialise(mrdr);
                }

                // Start comparing the details. Every difference is printed before failing, so that a
                // single run reports all of the discrepancies.

                bool failed = false;
                if (model.Folding != reference.Folding)
                {
                    WriteLine($"    ** reference folding size={reference.Folding}");
                    failed = true;
                }
                if (model.TrainingSize != reference.TrainingSize)
                {
                    WriteLine($"    ** reference training size={reference.TrainingSize}");
                    failed = true;
                }
                if (model.TrainingActives != reference.TrainingActives)
                {
                    WriteLine($"    ** reference training actives={reference.TrainingActives}");
                    failed = true;
                }
                if (model.RocType != reference.RocType)
                {
                    WriteLine($"    ** reference ROC type={reference.RocType}");
                    failed = true;
                }
                if (!DblEqual(model.RocAUC, reference.RocAUC))
                {
                    WriteLine($"    ** reference ROC AUC={reference.RocAUC}");
                    failed = true;
                }
                if (Math.Abs(model.LowThreshold - reference.LowThreshold) > 0.00000000000001)
                {
                    WriteLine($"    ** reference lowThresh={reference.LowThreshold} different to calculated {model.LowThreshold}");
                    failed = true;
                }
                if (Math.Abs(model.HighThreshold - reference.HighThreshold) > 0.00000000000001)
                {
                    WriteLine($"    ** reference highThresh={reference.HighThreshold} different to calculated {model.HighThreshold}");
                    failed = true;
                }

                // make sure individual hash bit contributions match
                var mbits = model.Contributions;
                var rbits = reference.Contributions;
                if (mbits.Count != rbits.Count)
                {
                    WriteLine($"    ** model has {mbits.Count} contribution bits, reference has {rbits.Count}");
                    failed = true;
                }
                foreach (var h in mbits.Keys)
                {
                    if (!rbits.ContainsKey(h))
                    {
                        WriteLine($"    ** model hash bit {h} not found in reference");
                        failed = true;
                        break; // one is enough
                    }
                }
                foreach (var h in rbits.Keys)
                {
                    if (!mbits.ContainsKey(h))
                    {
                        WriteLine($"    ** reference hash bit {h} not found in model");
                        failed = true;
                        break; // one is enough
                    }
                }
                foreach (var h in mbits.Keys)
                {
                    // only bits present on both sides are compared; missing bits were reported above
                    double c2;
                    if (rbits.TryGetValue(h, out c2))
                    {
                        double c1 = mbits[h];
                        if (!DblEqual(c1, c2))
                        {
                            WriteLine($"    ** contribution for bit {h}: model={c1}, reference={c2}");
                            failed = true;
                            break; // one is enough
                        }
                    }
                }

                if (failed)
                {
                    throw new CDKException("Comparison to reference failed");
                }
            }
            catch (CDKException)
            {
                // already carries a meaningful message; rethrow without wrapping
                throw;
            }
            catch (Exception ex)
            {
                throw new CDKException("Test failed", ex);
            }
        }
Esempio n. 5
0
        // Trains a model on an SD file resource, tallies a confusion matrix from its scaled predictions
        // and checks the tallies against the required counts.
        //   sdfile:   resource name, resolved under "NCDK.Data.CDD."; activity is read from the "Active" property
        //   truePos, trueNeg, falsePos, falseNeg: required confusion matrix counts
        // Throws CDKException("Confusion matrix mismatch") when any count differs; exceptions raised while
        // loading that are not CDKException are wrapped in CDKException("Test failed").
        private static void ConfirmPredictions(string sdfile, int truePos, int trueNeg, int falsePos, int falseNeg)
        {
            WriteLine($"[{sdfile}] comparing confusion matrix");

            var structures = new List<IAtomContainer>();
            var labels     = new List<bool>();
            var bayes      = new Bayesian(CircularFingerprinterClass.ECFP6, 1024);

            try
            {
                using (var stream = ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"))
                {
                    foreach (var entry in new EnumerableSDFReader(stream, ChemObjectBuilder.Instance))
                    {
                        bool isActive = "true" == (string)entry.GetProperties()["Active"];

                        // keep molecule and label side by side so they can be replayed for prediction
                        structures.Add(entry);
                        labels.Add(isActive);
                        bayes.AddMolecule(entry, isActive);
                    }
                }
            }
            catch (CDKException)
            {
                throw;
            }
            catch (Exception ex)
            {
                throw new CDKException("Test failed", ex);
            }

            bayes.Build();
            bayes.ValidateLeaveOneOut();

            // build the confusion matrix; a scaled prediction of 0.5 or more counts as "predicted active"
            int tp = 0, tn = 0, fp = 0, fn = 0;

            for (int i = 0; i < structures.Count; i++)
            {
                bool predictedActive = bayes.ScalePredictor(bayes.Predict(structures[i])) >= 0.5;
                bool reallyActive    = labels[i];

                if (predictedActive && reallyActive)
                {
                    tp++;
                }
                else if (predictedActive)
                {
                    fp++;
                }
                else if (reallyActive)
                {
                    fn++;
                }
                else
                {
                    tn++;
                }
            }

            WriteLine($"    True Positives:  got={tp} require={truePos}");
            WriteLine($"         Negatives:  got={tn} require={trueNeg}");
            WriteLine($"    False Positives: got={fp} require={falsePos}");
            WriteLine($"          Negatives: got={fn} require={falseNeg}");

            bool matches = tp == truePos && tn == trueNeg && fp == falsePos && fn == falseNeg;
            if (!matches)
            {
                throw new CDKException("Confusion matrix mismatch");
            }
        }