/// <summary>
/// Asserts that every SMARTS fragment generated for the circular-fingerprint
/// environments of a molecule appears among the expected SMARTS strings.
/// </summary>
/// <param name="moleculeSmiles">SMILES of the molecule to fingerprint.</param>
/// <param name="expectedFPSmarts">
/// Jagged array of acceptable SMARTS: each inner array lists several equivalent
/// spellings of the same fragment (e.g. "CCC" and "C(C)C").
/// </param>
private static void CheckFPSmartsForMolecule(string moleculeSmiles, string[][] expectedFPSmarts)
        {
            // Flatten all equivalent variants into one lookup set; any variant is acceptable.
            var allowed = new HashSet<string>();
            for (int group = 0; group < expectedFPSmarts.Length; group++)
            {
                string[] variants = expectedFPSmarts[group];
                for (int k = 0; k < variants.Length; k++)
                {
                    allowed.Add(variants[k]);
                }
            }

            var molecule = parser.ParseSmiles(moleculeSmiles);

            var fingerprinter = new CircularFingerprinter();
            fingerprinter.Calculate(molecule);

            var extractor = new SmartsFragmentExtractor(molecule);
            extractor.SetMode(SubstructureSelectionMode.JCompoundMapper);

            // Generate one SMARTS per fingerprint environment (duplicates collapse in the set).
            var generated = new HashSet<string>();
            var total     = fingerprinter.FPCount;
            int index     = 0;
            while (index < total)
            {
                var environment = fingerprinter.GetFP(index);
                generated.Add(extractor.Generate(environment.Atoms));
                index++;
            }

            Assert.IsTrue(allowed.IsSupersetOf(generated));
        }
Esempio n. 2
0
        /// <summary>
        /// Renders the fingerprints currently held by a circular fingerprinter as a
        /// tab-separated table (one header line, then one line per fingerprint).
        /// </summary>
        /// <param name="cfp">
        /// Fingerprinter whose fingerprints are listed via <c>FPCount</c> / <c>GetFP</c>.
        /// </param>
        /// <returns>
        /// Text with the columns index, hash, iteration and a parenthesised,
        /// comma-separated atom list; every line ends with CR LF.
        /// </returns>
        public static string CircularFpToString(CircularFingerprinter cfp)
        {
            // A StringBuilder avoids re-copying the whole accumulated text on every row,
            // which repeated string concatenation in a loop would do.
            var text = new System.Text.StringBuilder("fp\thashCode\titeration\tatoms\r\n");

            int fpCount = cfp.FPCount;

            for (int fpi = 0; fpi < fpCount; fpi++)
            {
                CircularFingerprint fp = cfp.GetFP(fpi);

                text.Append(fpi).Append('\t')
                    .Append(fp.Hash).Append('\t')
                    .Append(fp.Iteration).Append("\t(")
                    .Append(string.Join(", ", fp.Atoms))
                    .Append(")\r\n");
            }

            return text.ToString();
        }
Esempio n. 3
0
        /// <summary>
        /// Appends a new row to the model source data, which consists of a molecule and whether or not it
        /// is considered active. The molecule is reduced to a sorted, de-duplicated array of
        /// (optionally folded) circular-fingerprint hash codes before being stored.
        /// </summary>
        /// <param name="mol">molecular structure, which must be non-null and contain at least one atom</param>
        /// <param name="active">whether active or not</param>
        /// <exception cref="CDKException">thrown when <paramref name="mol"/> is null or has no atoms</exception>
        public void AddMolecule(IAtomContainer mol, bool active)
        {
            // reject null or empty structures up front
            if (mol == null || mol.Atoms.Count == 0)
            {
                throw new CDKException("Molecule cannot be blank or null.");
            }

            // a fresh fingerprinter per molecule, configured from this instance's settings
            var circ = new CircularFingerprinter(ClassType)
            {
                PerceiveStereo = this.PerceiveStereo
            };

            circ.Calculate(mol);

            // gather all of the (folded) fingerprints into a sorted set
            // NOTE(review): masking with Folding - 1 presumably assumes Folding is a power of two - confirm
            int AND_BITS = Folding - 1; // e.g. 1024/0x400 -> 1023/0x3FF: chop off higher order bits
            var hashset  = new SortedSet <int>();

            for (int n = circ.FPCount - 1; n >= 0; n--)
            {
                int code = circ.GetFP(n).Hash;
                // the mask is applied only when Folding is positive; otherwise the raw hash is kept
                if (Folding > 0)
                {
                    code &= AND_BITS;
                }
                hashset.Add(code);
            }

            // convert the set into a sorted primitive array
            var hashes = new int[hashset.Count];
            int p      = 0;

            foreach (var h in hashset)
            {
                hashes[p++] = h;
            }

            // record the processed information for model building purposes
            if (active)
            {
                numActive++;
            }
            // training and activity each receive one entry per call
            training.Add(hashes);
            activity.Add(active);
            // per-hash bookkeeping: look up any entry already stored in inHash for this hash
            foreach (var h in hashes)
            {
                if (!inHash.TryGetValue(h, out int[] stash))
Esempio n. 4
0
        /// <summary>
        /// Verifies that the default circular fingerprinter sets exactly the expected
        /// bits for <c>trivialMol</c>.
        /// </summary>
        public void TestGetBitFingerprint()
        {
            Assert.IsTrue(trivialMol != null);

            var fingerprinter = new CircularFingerprinter();
            var fingerprint   = fingerprinter.GetBitFingerprint(trivialMol);

            BitArray expectedBits = new BitArray(0);
            BitArray actualBits   = fingerprint.AsBitSet();

            // bit positions that must be set in the fingerprint
            int[] requiredBits = { 19, 152, 293, 340, 439, 480, 507, 726, 762, 947, 993 };
            for (int k = 0; k < requiredBits.Length; k++)
            {
                BitArrays.SetValue(expectedBits, requiredBits[k], true);
            }

            if (BitArrays.Equals(expectedBits, actualBits))
            {
                return;
            }

            throw new CDKException("Got " + actualBits + ", wanted " + expectedBits);
        }
Esempio n. 5
0
        /// <summary>
        /// Compares fingerprints of the same structure obtained from two SMILES strings
        /// and from a molfile, with stereo perception switched on and then off.
        /// </summary>
        public void TestUseStereoElements()
        {
            const string smiles1 = "CC[C@@H](C)O";
            const string smiles2 = "CC[C@H](O)C";
            const string molfile =
                "\n" +
                "  CDK     10121722462D          \n" +
                "\n" +
                "  5  4  0  0  0  0            999 V2000\n" +
                "   -4.1837    2.6984    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n" +
                "   -3.4692    3.1109    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n" +
                "   -2.7547    2.6984    0.0000 C   0  0  1  0  0  0  0  0  0  0  0  0\n" +
                "   -2.0403    3.1109    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n" +
                "   -2.7547    1.8734    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0\n" +
                "  1  2  1  0  0  0  0\n" +
                "  2  3  1  0  0  0  0\n" +
                "  3  4  1  0  0  0  0\n" +
                "  3  5  1  1  0  0  0\n" +
                "M  END\n";

            IChemObjectBuilder builder       = CDK.Builder;
            MDLV2000Reader     molfileReader = new MDLV2000Reader(new StringReader(molfile));
            SmilesParser       smilesParser  = new SmilesParser(builder);

            var            fromSmiles1 = smilesParser.ParseSmiles(smiles1);
            var            fromSmiles2 = smilesParser.ParseSmiles(smiles2);
            IAtomContainer fromMolfile = molfileReader.Read(builder.NewAtomContainer());

            var fingerprinter = new CircularFingerprinter();

            // With stereo perception on, the SMILES-derived molecules carry no coordinates,
            // so they agree with each other but differ from the molfile-derived molecule.
            fingerprinter.PerceiveStereo = true;

            var perceivedA = fingerprinter.GetFingerprint(fromSmiles1);
            var perceivedB = fingerprinter.GetFingerprint(fromSmiles2);
            Assert.IsTrue(Compares.AreEqual(perceivedA, perceivedB));

            var perceivedC = fingerprinter.GetFingerprint(fromSmiles2);
            var perceivedD = fingerprinter.GetFingerprint(fromMolfile);
            Assert.IsFalse(Compares.AreEqual(perceivedC, perceivedD));

            // With stereo perception off, all three sources are expected to agree.
            fingerprinter.PerceiveStereo = false;

            var plainA = fingerprinter.GetFingerprint(fromSmiles1);
            var plainB = fingerprinter.GetFingerprint(fromSmiles2);
            Assert.IsTrue(Compares.AreEqual(plainA, plainB));

            var plainC = fingerprinter.GetFingerprint(fromSmiles2);
            var plainD = fingerprinter.GetFingerprint(fromMolfile);
            Assert.IsTrue(Compares.AreEqual(plainC, plainD));
        }
Esempio n. 6
0
        /// <summary>
        /// Verifies that the count fingerprint of <c>trivialMol</c> contains exactly the
        /// expected hash values, each with the expected count.
        /// </summary>
        public void TestGetCountFingerprint()
        {
            Assert.IsTrue(trivialMol != null);

            var fingerprinter = new CircularFingerprinter();
            var counts        = fingerprinter.GetCountFingerprint(trivialMol);

            // flattened (hash, count) pairs
            int[] answerKey =
            {
                -414937772,  1, -1027418143, 1, 1627608083, 1, -868007456, 1, -1006701866, 1,
                -1059145289, 1,  -801752141, 1,  790592664, 1, -289109509, 1, -1650154758, 1, 1286833445, 1
            };

            int pairCount = answerKey.Length / 2;

            // the number of populated bins has to equal the number of expected pairs
            if (counts.GetNumberOfPopulatedBins() != pairCount)
            {
                throw new CDKException("Hash values do not match.");
            }

            for (int bin = 0; bin < counts.GetNumberOfPopulatedBins(); bin++)
            {
                int gotHash  = counts.GetHash(bin);
                int gotCount = counts.GetCount(bin);

                bool matched = false;
                for (int k = 0; k < pairCount * 2; k += 2)
                {
                    if (gotHash != answerKey[k])
                    {
                        continue;
                    }

                    matched = true;
                    int wantCount = answerKey[k + 1];
                    if (gotCount != wantCount)
                    {
                        throw new CDKException("For hash " + gotHash + " got count " + gotCount + " but wanted " + wantCount);
                    }
                }

                // a populated bin whose hash is absent from the key is a failure
                if (!matched)
                {
                    throw new CDKException("Hash values do not match.");
                }
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Scratch/debug routine: reads structures from a hard-coded local file and, for
        /// each record, runs the count-, raw- and bit-fingerprint calls of an ECFP4
        /// <c>CircularFingerprinter</c> on the result of
        /// <c>CdkMol.GetLargestMoleculeFragment</c>. Nothing is returned or stored.
        /// </summary>
        /// <remarks>
        /// The file stream and the record enumerator are released even when a record
        /// fails to parse or fingerprint.
        /// </remarks>
        public static void BuildTest()
        {
            const string inputPath = @"C:\Download\CorpId-12345.mol";
            const int    fpLen     = 2048;  // folded binary fp length

            // ECFP4 is the class actually used (an earlier ECFP6 setting was overridden for debugging).
            var cfp = new CircularFingerprinter(CircularFingerprinterClass.ECFP4, fpLen);

            using (var reader = new StreamReader(inputPath))
            {
                EnumerableSDFReader rdr = new EnumerableSDFReader(reader.BaseStream, ChemObjectBuilder.Instance);
                rdr.ReaderMode = ChemObjectReaderMode.Relaxed;

                using (IEnumerator<IAtomContainer> cursor = rdr.GetEnumerator())
                {
                    while (cursor.MoveNext())
                    {
                        IAtomContainer mol = CdkMol.GetLargestMoleculeFragment(cursor.Current);

                        // The locals below are never read by code.
                        // NOTE(review): presumably kept so the values can be inspected in a debugger.
                        ICountFingerprint cfp1 = cfp.GetCountFingerprint(mol);  // hash values and counts

                        cfp.Calculate(mol);
                        int fpCount = cfp.FPCount;
                        for (int fpi = 0; fpi < fpCount; fpi++)
                        {
                            // hash, iteration and atom list (dups appear multiple times)
                            CircularFingerprint cfp2 = cfp.GetFP(fpi);
                        }

                        IBitFingerprint bfp    = cfp.GetBitFingerprint(mol);
                        BitArray        bs     = bfp.AsBitSet();
                        int             bsCard = bfp.Cardinality;
                        long            bsSize = bfp.Length;
                    }
                }
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Builds a bit-set fingerprint of the requested type for a molecule.
        /// </summary>
        /// <param name="mol">Molecule to fingerprint; must not be null.</param>
        /// <param name="fpType">Which fingerprinter implementation to use.</param>
        /// <param name="fpSubtype">
        /// Only used for <c>FingerprintType.Circular</c>: the <c>CircularFingerprinterClass</c>
        /// value as an int. Values outside the ECFP0..ECFP6 range select
        /// <c>CircularFingerprintType.DefaultCircularClass</c>.
        /// </param>
        /// <param name="fpLen">
        /// Only used for <c>FingerprintType.Circular</c>: the length passed to the
        /// fingerprinter. A negative value selects
        /// <c>CircularFingerprintType.DefaultCircularLength</c>.
        /// </param>
        /// <returns>The fingerprint produced by the selected fingerprinter, cast to <c>BitSetFingerprint</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="mol"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="fpType"/> is not one of the handled types.</exception>
        public static BitSetFingerprint BuildBitSetFingerprint(
            IAtomContainer mol,
            FingerprintType fpType,
            int fpSubtype = -1,
            int fpLen     = -1)
        {
            // Data for Tanimoto similarity using various fingerprint types for CorpId 123456 query.
            // Cart - Standard MDL Oracle Cartridge scores
            //
            //                         Similarity Score
            //         ------------------------------------------------
            // Size ->        192    896   1024  1024   128  1024   320
            // CorpId  Cart  MACCS  PbChm  ECFP4 EXT  EState Basic Sbstr
            // ------  ----  ----   ----   ----  ----  ----  ----  ----
            // 123456  0.99  1.00   1.00   1.00  1.00  1.00  1.00  1.00
            // 123456  0.99  0.98   0.96   0.77  0.95  1.00  0.95  1.00
            // 123456  0.99  0.98   0.96   0.77  0.95  1.00  0.94  1.00
            // 123456  0.99  1.00   1.00   1.00  1.00  1.00  1.00  1.00
            // 123456  0.99  1.00   1.00   1.00  1.00  1.00  1.00  1.00
            // 123456  0.99  0.91   1.00   0.81  1.00  1.00  1.00  1.00
            // 123456  0.98  0.95   1.00   0.74  0.92  1.00  0.93  0.94
            // 123456  0.98  1.00   1.00   1.00  1.00  1.00  1.00  1.00
            // 123456  0.98  1.00   1.00   1.00  1.00  1.00  1.00  1.00
            // 123456  0.98  1.00   0.83   0.76  0.77  0.90  0.76  0.94

            // LSH Bin Count - The number of LSH bins (of 25) that match the query bin values
            //--------------
            // CorpId  MAC  PbC ECFP  EX
            // ------  ---  ---  --- ---
            // 123456   25   25   25  25
            // 123456   25   20    7  16
            // 123456   25   20    9  19
            // 123456   25   25   25  25
            // 123456   25   25   25  25
            // 123456   20   25    9  25
            // 123456   21   25   11  17
            // 123456   25   25   25  25
            // 123456   25   25   25  25
            // 123456   25    9    6  11

            // Data for Tanimoto similarity using various Circular fingerprint types.
            // Using 2 molecules where the 2nd just has an added methyl group.
            //
            //  Measure      Score
            //  --------     -----
            //  ECFP0        1.00
            //  ECFP2         .88
            //  ECFP4         .75
            //  ECFP6         .64
            //  FCFP0        1.00
            //  FCFP2         .92
            //  FCFP4         .84
            //  FCFP6         .74

            if (mol == null)
            {
                throw new ArgumentNullException(nameof(mol));
            }

            IFingerprinter ifptr;

            DateTime t0 = DateTime.Now;

            if (fpType == FingerprintType.Basic)             // size = 1024
            {
                ifptr = new Fingerprinter();
            }

            else if (fpType == FingerprintType.Circular)             // size variable
            {
                CircularFingerprinterClass cfpClass = (CircularFingerprinterClass)fpSubtype;

                // NOTE(review): whether the FCFP classes fall inside this range depends on the
                // enum's declaration order - confirm if FCFP subtypes are meant to be accepted.
                if (cfpClass < CircularFingerprinterClass.ECFP0 || cfpClass > CircularFingerprinterClass.ECFP6)
                {
                    cfpClass = (CircularFingerprinterClass)CircularFingerprintType.DefaultCircularClass;                     // default class
                }
                if (fpLen < 0)
                {
                    fpLen = CircularFingerprintType.DefaultCircularLength;                            // default length
                }
                ifptr = new CircularFingerprinter(cfpClass, fpLen);
            }

            else if (fpType == FingerprintType.Extended)             // size = 1024
            {
                ifptr = new ExtendedFingerprinter();                 // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
            }

            else if (fpType == FingerprintType.EState)             // size = 128
            {
                ifptr = new EStateFingerprinter();                 // use DEFAULT_SIZE and DEFAULT_SEARCH_DEPTH
            }

            else if (fpType == FingerprintType.MACCS)             // size = 192
            {
                // the MACCS fingerprinter is created once and then reused via the MACCSFp member
                if (MACCSFp == null)
                {
                    MACCSFp = new MACCSFingerprinter();
                }

                ifptr = MACCSFp;
            }

            else if (fpType == FingerprintType.PubChem)             // size = 896
            {
                ifptr = new PubchemFingerprinter();
            }

            else if (fpType == FingerprintType.ShortestPath)             // size =
            {
                ifptr = new ShortestPathFingerprinter();                 // fails with atom type issue for many structures (e.g. 123456)
            }

            else if (fpType == FingerprintType.Signature)             // size =
            {
                ifptr = new SignatureFingerprinter();                 // can't convert array fingerprint to bitsetfingerprint
            }

            else if (fpType == FingerprintType.Substructure)             // size = 320
            {
                ifptr = new SubstructureFingerprinter();
            }

            else
            {
                throw new ArgumentOutOfRangeException(nameof(fpType), fpType, "Invalid CdkFingerprintType: " + fpType);
            }

            // The two timing values below are never read by code.
            // NOTE(review): presumably retained for inspection while debugging.
            double getFptrTime = TimeOfDay.Delta(ref t0);

            IBitFingerprint   ibfp = ifptr.GetBitFingerprint(mol);
            BitSetFingerprint bfp  = (BitSetFingerprint)ibfp;

            double buildFpTime = TimeOfDay.Delta(ref t0);

            return bfp;
        }