Beispiel #1
0
        // ----------------- private methods -----------------

        // Verifies, for a single molecule, that the hash codes the model creates (and folds) match a reference list.
        //   molstr:    SD-file text; only the first record in it is used
        //   classType: fingerprint class handed to the Bayesian model (only ECFP6 is labelled as such in the log line,
        //              every other value is logged as "FCFP6")
        //   folding:   folding size handed to the Bayesian model
        //   refHash:   expected hash codes; must match the calculated ones in length, order and value
        // Throws CDKException("Hashes differ.") after logging both arrays when they do not match.
        private static void CheckFP(string molstr, CircularFingerprinterClass classType, int folding, int[] refHash)
        {
            var strType = classType == CircularFingerprinterClass.ECFP6 ? "ECFP6" : "FCFP6";

            WriteLine($"Comparing hash codes for {strType}/folding={folding}");

            // the SDF reader is disposable, so release it deterministically instead of abandoning it after First()
            using (var rdr = new EnumerableSDFReader(new StringReader(molstr), ChemObjectBuilder.Instance))
            {
                var mol   = rdr.First();
                var model = new Bayesian(classType, folding);

                model.AddMolecule(mol, false);

                // the model holds exactly one molecule, so its first training entry is the one just added
                var calcHash = model.Training[0];

                // SequenceEqual is false for differing lengths as well as for any differing element
                if (!calcHash.SequenceEqual(refHash))
                {
                    WriteLine($"    ** calculated: {ArrayStr(calcHash)}");
                    WriteLine($"    ** reference:  {ArrayStr(refHash)}");
                    throw new CDKException("Hashes differ.");
                }
            }
        }
Beispiel #2
0
        // Walks through every molecule of an embedded SD file and asserts that the folded fingerprint hashes calculated
        // for it are textually identical to the reference string stored in the molecule's own property.
        //   sdfile:    resource name, resolved below the NCDK.Data.CDD prefix
        //   fpField:   name of the molecule property that holds the reference hash string
        //   classType: fingerprint class handed to the Bayesian model
        //   folding:   folding size handed to the Bayesian model
        private static void CompareFolding(string sdfile, string fpField, CircularFingerprinterClass classType, int folding)
        {
            WriteLine($"[{sdfile}] calculation of: {fpField}");

            using (var stream = ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"))
            {
                int rowNumber = 0;

                foreach (var molecule in new EnumerableSDFReader(stream, ChemObjectBuilder.Instance))
                {
                    rowNumber++;

                    // a fresh model per molecule; its first training entry is read back right after the add
                    var bayes = new Bayesian(classType, folding);
                    bayes.AddMolecule(molecule, false);

                    var actual   = ArrayStr(bayes.Training[0]);
                    var expected = (string)molecule.GetProperties()[fpField];

                    Assert.AreEqual(expected, actual, $"Folded hashes do not match reference at {rowNumber}.");
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Instantiate a Bayesian model with no data.
        /// </summary>
        /// <param name="classType">one of the <see cref="CircularFingerprinterClass"/> enum values</param>
        /// <param name="folding">the maximum number of fingerprint bits, which must be a power of 2 (e.g. 1024, 2048) or 0 for no folding</param>
        /// <exception cref="ArithmeticException">
        /// <paramref name="folding"/> is negative, or is positive but not a power of 2.
        /// </exception>
        public Bayesian(CircularFingerprinterClass classType, int folding)
        {
            ClassType = classType;
            Folding   = folding;

            // A positive power of 2 has exactly one bit set, so clearing its lowest set bit leaves zero.
            // (1 counts as a power of 2 here, 0 is the explicit "no folding" value.)
            bool isPowerOfTwo = folding > 0 && (folding & (folding - 1)) == 0;

            if (folding != 0 && !isPowerOfTwo)
            {
                throw new ArithmeticException($"Fingerprint folding {folding} invalid: must be 0 or power of 2.");
            }
        }
Beispiel #4
0
        // Bulk test: trains a Bayesian model on every record of an embedded SD file, cross-validates it, and compares
        // the result with a previously serialised reference model.
        //   sdfile:         resource name of the training SD file, resolved below the NCDK.Data.CDD prefix
        //   actvField:      molecule property holding the activity: "true", "false", or an integer that must be 0 or 1
        //   classType:      fingerprint class handed to the model
        //   folding:        folding size handed to the model
        //   xval:           3 = three-fold, 5 = five-fold, any other value = leave-one-out validation
        //   modelFN:        resource name of the serialised reference model, resolved below the NCDK.Data.CDD prefix
        //   perceiveStereo: value assigned to the model's PerceiveStereo property
        // Discrepancies are reported through WriteLine and lead to CDKException("Comparison to reference failed").
        // A CDKException raised anywhere inside passes through unchanged; any other exception is wrapped in
        // CDKException("Test failed").
        private void RunTest(string sdfile, string actvField, CircularFingerprinterClass classType, int folding, int xval, string modelFN, bool perceiveStereo)
        {
            WriteLine($"[{modelFN}]");
            WriteLine($"    Loading {sdfile}");

            try
            {
                var model = new Bayesian(classType, folding);
                model.PerceiveStereo = perceiveStereo;

                // ---- load the training set ----

                int rowCount = 0, activeCount = 0;
                using (var molecules = new EnumerableSDFReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{sdfile}"), ChemObjectBuilder.Instance))
                {
                    foreach (var molecule in molecules)
                    {
                        rowCount++;

                        var text = (string)molecule.GetProperties()[actvField];

                        int activity;
                        if (text.Equals("true", StringComparison.Ordinal))
                        {
                            activity = 1;
                        }
                        else if (text.Equals("false", StringComparison.Ordinal))
                        {
                            activity = 0;
                        }
                        else
                        {
                            activity = int.Parse(text, NumberFormatInfo.InvariantInfo);
                        }

                        if (!(activity == 0 || activity == 1))
                        {
                            throw new CDKException("Activity field not found or invalid");
                        }

                        model.AddMolecule(molecule, activity == 1);
                        activeCount += activity;
                    }
                }

                int inactiveCount = rowCount - activeCount;
                WriteLine($"    Training with {rowCount} rows, {activeCount} actives, {inactiveCount} inactives");

                // ---- build and validate ----

                model.Build();
                switch (xval)
                {
                    case 3:
                        model.ValidateThreeFold();
                        break;
                    case 5:
                        model.ValidateFiveFold();
                        break;
                    default:
                        model.ValidateLeaveOneOut();
                        break;
                }

                WriteLine($"    Validation: ROC AUC={model.RocAUC}");
                WriteLine($"    Parsing reference model");

                Bayesian reference;
                using (var modelReader = new StreamReader(ResourceLoader.GetAsStream($"NCDK.Data.CDD.{modelFN}")))
                {
                    reference = Bayesian.Deserialise(modelReader);
                }

                // ---- compare scalar details; keep going after a mismatch so that all of them get reported ----

                const double thresholdTolerance = 0.00000000000001;

                bool mismatch = false;
                if (model.Folding != reference.Folding)
                {
                    WriteLine($"    ** reference folding size={reference.Folding}");
                    mismatch = true;
                }
                if (model.TrainingSize != reference.TrainingSize)
                {
                    WriteLine($"    ** reference training size={reference.TrainingSize}");
                    mismatch = true;
                }
                if (model.TrainingActives != reference.TrainingActives)
                {
                    WriteLine($"    ** reference training actives={reference.TrainingActives}");
                    mismatch = true;
                }
                if (model.RocType != reference.RocType)
                {
                    WriteLine($"    ** reference ROC type={reference.RocType}");
                    mismatch = true;
                }
                if (!DblEqual(model.RocAUC, reference.RocAUC))
                {
                    WriteLine($"    ** reference ROC AUC={reference.RocAUC}");
                    mismatch = true;
                }
                if (Math.Abs(model.LowThreshold - reference.LowThreshold) > thresholdTolerance)
                {
                    WriteLine($"    ** reference lowThresh={reference.LowThreshold} different to calculated {model.LowThreshold}");
                    mismatch = true;
                }
                if (Math.Abs(model.HighThreshold - reference.HighThreshold) > thresholdTolerance)
                {
                    WriteLine($"    ** reference highThresh={reference.HighThreshold} different to calculated {model.HighThreshold}");
                    mismatch = true;
                }

                // ---- compare the individual hash bit contributions ----

                var modelBits     = model.Contributions;
                var referenceBits = reference.Contributions;
                if (modelBits.Count != referenceBits.Count)
                {
                    WriteLine($"    ** model has {modelBits.Count} contribution bits, reference has {referenceBits.Count}");
                    mismatch = true;
                }

                // bits the model has but the reference lacks: only the first one is reported
                foreach (var bit in modelBits.Keys)
                {
                    if (referenceBits.ContainsKey(bit))
                    {
                        continue;
                    }
                    WriteLine($"    ** model hash bit {bit} not found in reference");
                    mismatch = true;
                    break;
                }

                // bits the reference has but the model lacks: only the first one is reported
                foreach (var bit in referenceBits.Keys)
                {
                    if (modelBits.ContainsKey(bit))
                    {
                        continue;
                    }
                    WriteLine($"    ** reference hash bit {bit} not found in model");
                    mismatch = true;
                    break;
                }

                // bits present on both sides must carry the same contribution: only the first difference is reported
                foreach (var bit in modelBits.Keys)
                {
                    if (!referenceBits.ContainsKey(bit))
                    {
                        continue;
                    }

                    double modelValue     = modelBits[bit];
                    double referenceValue = referenceBits[bit];
                    if (!DblEqual(modelValue, referenceValue))
                    {
                        WriteLine($"    ** contribution for bit {bit}: model={modelValue}, reference={referenceValue}");
                        mismatch = true;
                        break;
                    }
                }

                if (mismatch)
                {
                    throw new CDKException("Comparison to reference failed");
                }
            }
            catch (Exception ex) when (!(ex is CDKException))
            {
                // CDKException is filtered out above and therefore propagates untouched
                throw new CDKException("Test failed", ex);
            }
        }
Beispiel #5
0
 // Convenience overload of the bulk test (load an SDfile, build a model with the given parameters, compare it to a
 // previously serialised reference model): forwards every argument to the seven-argument RunTest and passes false
 // as the trailing stereo-perception flag.
 private void RunTest(string sdfile, string actvField, CircularFingerprinterClass classType, int folding, int xval, string modelFN)
 {
     const bool withStereo = false;

     RunTest(sdfile, actvField, classType, folding, xval, modelFN, withStereo);
 }
Beispiel #6
0
        // ----------------- public methods -----------------

        /// <summary>
        /// Instantiate a Bayesian model with no data, specifying only the fingerprint class.
        /// </summary>
        /// <remarks>
        /// This constructor assigns nothing but <c>ClassType</c>; every other member keeps its declared initial value.
        /// </remarks>
        /// <param name="classType">one of the <see cref="CircularFingerprinterClass"/> enum values</param>
        public Bayesian(CircularFingerprinterClass classType)
        {
            ClassType = classType;
        }
Beispiel #7
0
        /// <summary>
        /// Debugging harness: reads the molecule record(s) from a hard-coded local file and runs the circular
        /// fingerprinter over the largest fragment of each, so the intermediate results can be inspected in a debugger.
        /// </summary>
        /// <remarks>
        /// Nothing is returned or reported. The locals inside the loop are deliberately kept, although never read,
        /// because they are the values one looks at while stepping through.
        /// </remarks>
        public static void BuildTest()
        {
            CircularFingerprinterClass fpClass = CircularFingerprinterClass.ECFP4; // FP diameter (debug setting)
            int fpLen = 2048;                                                      // folded binary fp length

            CircularFingerprinter cfp = new CircularFingerprinter(fpClass, fpLen);

            // using blocks release the file and the SDF reader even when parsing or fingerprinting throws
            using (StreamReader reader = new StreamReader(@"C:\Download\CorpId-12345.mol"))
            using (EnumerableSDFReader rdr = new EnumerableSDFReader(reader.BaseStream, ChemObjectBuilder.Instance))
            {
                rdr.ReaderMode = ChemObjectReaderMode.Relaxed;

                // foreach also disposes the enumerator, which the manual MoveNext loop never did
                foreach (IAtomContainer record in rdr)
                {
                    IAtomContainer mol = CdkMol.GetLargestMoleculeFragment(record);

                    ICountFingerprint countFp = cfp.GetCountFingerprint(mol); // get hash values and counts for each

                    cfp.Calculate(mol);
                    int fpCount = cfp.FPCount;
                    for (int fpi = 0; fpi < fpCount; fpi++)
                    {
                        CircularFingerprint fp = cfp.GetFP(fpi); // gets hash, iteration and lists of atoms (dups appear multiple times)
                    }

                    IBitFingerprint bfp    = cfp.GetBitFingerprint(mol);
                    BitArray        bs     = bfp.AsBitSet();
                    int             bsCard = bfp.Cardinality;
                    long            bsSize = bfp.Length;
                }
            }
        }
Beispiel #8
0
        /// <summary>
        /// Build a bit-set fingerprint of the requested type for a molecule.
        /// </summary>
        /// <param name="mol">Molecule to fingerprint; passed as-is to the selected fingerprinter.</param>
        /// <param name="fpType">Which fingerprinter to use.</param>
        /// <param name="fpSubtype">
        /// Only read for <c>FingerprintType.Circular</c>: cast to <c>CircularFingerprinterClass</c>. A value outside
        /// the ECFP0..ECFP6 range (including the default -1) is replaced by
        /// <c>CircularFingerprintType.DefaultCircularClass</c>.
        /// </param>
        /// <param name="fpLen">
        /// Only read for <c>FingerprintType.Circular</c>: fingerprint length. A negative value (including the
        /// default -1) is replaced by <c>CircularFingerprintType.DefaultCircularLength</c>.
        /// </param>
        /// <returns>The fingerprint produced by the selected fingerprinter, cast to <c>BitSetFingerprint</c>.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="fpType"/> is not one of the fingerprint types handled here.
        /// </exception>
        public static BitSetFingerprint BuildBitSetFingerprint(
            IAtomContainer mol,
            FingerprintType fpType,
            int fpSubtype = -1,
            int fpLen     = -1)
        {
            // Data for Tanimoto similarity using various fingerprint types for CorpId 123456 query.
            // Cart - Standard MDL Oracle Cartridge scores
            //
            //                         Similarity Score
            //         --------------------------------------------------------
            // Size ->        192    896   1024   1024     128   1024    320
            // CorpId  Cart  MACCS  PbChm  ECFP4   EXT  EState  Basic  Sbstr
            // ------  ----  -----  -----  -----  ----  ------  -----  -----
            // 123456  0.99   1.00   1.00   1.00  1.00    1.00   1.00   1.00
            // 123456  0.99   0.98   0.96   0.77  0.95    1.00   0.95   1.00
            // 123456  0.99   0.98   0.96   0.77  0.95    1.00   0.94   1.00
            // 123456  0.99   1.00   1.00   1.00  1.00    1.00   1.00   1.00
            // 123456  0.99   1.00   1.00   1.00  1.00    1.00   1.00   1.00
            // 123456  0.99   0.91   1.00   0.81  1.00    1.00   1.00   1.00
            // 123456  0.98   0.95   1.00   0.74  0.92    1.00   0.93   0.94
            // 123456  0.98   1.00   1.00   1.00  1.00    1.00   1.00   1.00
            // 123456  0.98   1.00   1.00   1.00  1.00    1.00   1.00   1.00
            // 123456  0.98   1.00   0.83   0.76  0.77    0.90   0.76   0.94

            // LSH Bin Count - The number of LSH bins (of 25) that match the query bin values
            // --------------
            // CorpId  MAC  PbC  ECFP   EX
            // ------  ---  ---  ----  ---
            // 123456   25   25    25   25
            // 123456   25   20     7   16
            // 123456   25   20     9   19
            // 123456   25   25    25   25
            // 123456   25   25    25   25
            // 123456   20   25     9   25
            // 123456   21   25    11   17
            // 123456   25   25    25   25
            // 123456   25   25    25   25
            // 123456   25    9     6   11

            // Data for Tanimoto similarity using various Circular fingerprint types.
            // Using 2 molecules where the 2nd just has an added methyl group.
            //
            //  Measure      Score
            //  --------     -----
            //  ECFP0        1.00
            //  ECFP2         .88
            //  ECFP4         .75
            //  ECFP6         .64
            //  FCFP0        1.00
            //  FCFP2         .92
            //  FCFP4         .84
            //  FCFP6         .74

            IFingerprinter ifptr;

            // Timing instrumentation; the two measurements are currently not reported anywhere.
            DateTime t0 = DateTime.Now;
            double   getFptrTime = 0, buildFpTime = 0;

            switch (fpType)
            {
                case FingerprintType.Basic: // size = 1024
                    ifptr = new Fingerprinter();
                    break;

                case FingerprintType.Circular: // size variable
                {
                    CircularFingerprinterClass cfpClass = (CircularFingerprinterClass)fpSubtype;

                    // NOTE(review): only ECFP0..ECFP6 pass this check. If the FCFP members are declared outside that
                    // range they are silently replaced by the default class, although the notes above list FCFP
                    // scores - confirm against the enum declaration.
                    if (cfpClass < CircularFingerprinterClass.ECFP0 || cfpClass > CircularFingerprinterClass.ECFP6)
                    {
                        cfpClass = (CircularFingerprinterClass)CircularFingerprintType.DefaultCircularClass; // default class
                    }
                    if (fpLen < 0)
                    {
                        fpLen = CircularFingerprintType.DefaultCircularLength; // default length
                    }

                    ifptr = new CircularFingerprinter(cfpClass, fpLen);
                    break;
                }

                case FingerprintType.Extended: // size = 1024
                    ifptr = new ExtendedFingerprinter(); // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
                    break;

                case FingerprintType.EState: // size = 128
                    ifptr = new EStateFingerprinter(); // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
                    break;

                case FingerprintType.MACCS: // size = 192
                    // the MACCS fingerprinter is created on first use and then shared through MACCSFp
                    if (MACCSFp == null)
                    {
                        MACCSFp = new MACCSFingerprinter();
                    }
                    ifptr = MACCSFp;
                    break;

                case FingerprintType.PubChem: // size = 896
                    ifptr = new PubchemFingerprinter();
                    break;

                case FingerprintType.ShortestPath:
                    ifptr = new ShortestPathFingerprinter(); // fails with atom type issue for many structures (e.g. 123456)
                    break;

                case FingerprintType.Signature:
                    ifptr = new SignatureFingerprinter(); // can't convert array fingerprint to bitsetfingerprint
                    break;

                case FingerprintType.Substructure: // size = 320
                    ifptr = new SubstructureFingerprinter();
                    break;

                default:
                    // an argument-specific exception instead of a bare System.Exception; callers that catch
                    // Exception still catch this one
                    throw new ArgumentOutOfRangeException("fpType", fpType, "Invalid CdkFingerprintType: " + fpType);
            }

            getFptrTime = TimeOfDay.Delta(ref t0);

            IBitFingerprint ibfp = ifptr.GetBitFingerprint(mol);

            // hard cast: a fingerprinter whose result is not a BitSetFingerprint makes this line throw
            BitSetFingerprint bfp = (BitSetFingerprint)ibfp;

            buildFpTime = TimeOfDay.Delta(ref t0);

            return bfp;
        }