Exemplo n.º 1
0
        public void CosineDistanceTest()
        {
            CosineDistance dist = new CosineDistance();

            // TODO: Re-enable argument checking only for debugging
            // Assert.Throws<ArgumentException>(() => dist.GetDistance(p0, q4));

            double result = dist.GetDistance(p0, q0);

            Assert.AreEqual(result, .2, 0.00001);

            result = dist.GetDistance(p1, q1);
            Assert.AreEqual(result, 0.029857, 0.00001);

            result = dist.GetDistance(p2, q2);
            Assert.AreEqual(result, 1);

            result = dist.GetDistance(p3, q3);
            Assert.AreEqual(result, 0, 0.00001);

            result = dist.GetDistance(p4, q4);
            Assert.AreEqual(result, 0.039354, 0.00001);

            result = dist.GetDistance(p5, q5);
            Assert.AreEqual(result, 0.031026, 0.00001);
        }
    public static void Main()
    {
        modshogun.init_shogun_with_defaults();

        double[,] traindata_real = Load.load_numbers("../data/fm_train_real.dat");
        double[,] testdata_real  = Load.load_numbers("../data/fm_test_real.dat");

        RealFeatures feats_train = new RealFeatures(traindata_real);
        RealFeatures feats_test  = new RealFeatures(testdata_real);

        CosineDistance distance = new CosineDistance(feats_train, feats_train);

        double[,] dm_train = distance.get_distance_matrix();
        distance.init(feats_train, feats_test);
        double[,] dm_test = distance.get_distance_matrix();

        foreach (double item in dm_train)
        {
            Console.Write(item);
        }

        foreach (double item in dm_test)
        {
            Console.Write(item);
        }

        modshogun.exit_shogun();
    }
    public static void Main()
    {
        modshogun.init_shogun_with_defaults();

        double[,] traindata_real = Load.load_numbers("../data/fm_train_real.dat");
        double[,] testdata_real = Load.load_numbers("../data/fm_test_real.dat");

        RealFeatures feats_train = new RealFeatures(traindata_real);
        RealFeatures feats_test = new RealFeatures(testdata_real);

        CosineDistance distance = new CosineDistance(feats_train, feats_train);

        double[,] dm_train = distance.get_distance_matrix();
        distance.init(feats_train, feats_test);
        double[,] dm_test = distance.get_distance_matrix();

        foreach(double item in dm_train) {
            Console.Write(item);
        }

        foreach(double item in dm_test) {
            Console.Write(item);
        }

        modshogun.exit_shogun();
    }
Exemplo n.º 4
0
        public void CosineDistanceTest( )
        {
            CosineDistance dist = new CosineDistance( );

            Assert.Throws <ArgumentException>(() => dist.GetDistance(p0, q4));

            double result = dist.GetDistance(p0, q0);

            Assert.AreApproximatelyEqual(result, .2, 0.00001);

            result = dist.GetDistance(p1, q1);
            Assert.AreApproximatelyEqual(result, 0.029857, 0.00001);

            result = dist.GetDistance(p2, q2);
            Assert.AreEqual(result, 1);

            result = dist.GetDistance(p3, q3);
            Assert.AreApproximatelyEqual(result, 0, 0.00001);

            result = dist.GetDistance(p4, q4);
            Assert.AreApproximatelyEqual(result, 0.039354, 0.00001);

            result = dist.GetDistance(p5, q5);
            Assert.AreApproximatelyEqual(result, 0.031026, 0.00001);
        }
Exemplo n.º 5
0
    static void Main(string[] argv)
    {
        modshogun.init_shogun_with_defaults();

        DoubleMatrix traindata_real = Load.load_numbers("../data/fm_train_real.dat");
        DoubleMatrix testdata_real = Load.load_numbers("../data/fm_test_real.dat");

        RealFeatures feats_train = new RealFeatures(traindata_real);
        RealFeatures feats_test = new RealFeatures(testdata_real);

        CosineDistance distance = new CosineDistance(feats_train, feats_train);

        DoubleMatrix dm_train = distance.get_distance_matrix();
        distance.init(feats_train, feats_test);
        DoubleMatrix dm_test = distance.get_distance_matrix();

        Console.WriteLine(dm_train.ToString());
        Console.WriteLine(dm_test.ToString());

        modshogun.exit_shogun();
    }
Exemplo n.º 6
0
 internal static HandleRef getCPtr(CosineDistance obj)
 {
     return((obj == null) ? new HandleRef(null, IntPtr.Zero) : obj.swigCPtr);
 }
Exemplo n.º 7
0
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var posts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var language           = Language.English;
            var pipeline           = Pipeline.For(language);
            var postsWithDocuments = posts
                                     .Select(post =>
            {
                var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
                {
                    UID = post.Title.Hash128()
                };
                pipeline.ProcessSingle(document);
                return(Post: post, Document: document);
            })
                                     .ToArray(); // Call ToArray to force evaluation of the document processing now

            Console.WriteLine("Training FastText model..");
            var fastText = new FastText(language, version: 0, tag: "");

            fastText.Data.Type                  = FastText.ModelType.PVDM;
            fastText.Data.Loss                  = FastText.LossType.NegativeSampling;
            fastText.Data.IgnoreCase            = true;
            fastText.Data.Epoch                 = 50;
            fastText.Data.Dimensions            = 512;
            fastText.Data.MinimumCount          = 1;
            fastText.Data.ContextWindow         = 10;
            fastText.Data.NegativeSamplingCount = 20;
            fastText.Train(
                postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
                );

            Console.WriteLine("Building recommendations..");

            // Combine the blog post data with the FastText-generated vectors
            var results = fastText
                          .GetDocumentVectors()
                          .Select(result =>
            {
                // Each document vector instance will include a "token" string that may be mapped back to the
                // UID of the document for each blog post. If there were a large number of posts to deal with
                // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
                // have a 100+ and so a LINQ "First" scan over the list will suffice.
                var uid           = UID128.Parse(result.Token);
                var postForResult = postsWithDocuments.First(
                    postWithDocument => postWithDocument.Document.UID == uid
                    );
                return(UID: uid, result.Vector, postForResult.Post);
            })
                          .ToArray(); // ToArray since we enumerate multiple times below

            // Construct a graph to search over, as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = 15, LevelLambda = 1 / Math.Log(15) }
                );

            graph.AddItems(results);

            // For every post, use the "KNNSearch" method on the graph to find the three most similar posts
            const int maximumNumberOfResultsToReturn = 3;
            var       postsWithSimilarResults        = results
                                                       .Select(result =>
            {
                // Request one result too many from the KNNSearch call because it's expected that the original
                // post will come back as the best match and we'll want to exclude that
                var similarResults = graph
                                     .KNNSearch(result, maximumNumberOfResultsToReturn + 1)
                                     .Where(similarResult => similarResult.Item.UID != result.UID)
                                     .Take(maximumNumberOfResultsToReturn); // Just in case the original post wasn't included

                return(new
                {
                    result.Post,
                    Similar = similarResults
                              .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance
                    })
                              .ToArray()
                });
            })
                                                       .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
                                                       .ToArray();

            foreach (var postWithSimilarResults in postsWithSimilarResults)
            {
                Console.WriteLine();
                Console.WriteLine(postWithSimilarResults.Post.Title);
                foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }
Exemplo n.º 8
0
 internal static HandleRef getCPtr(CosineDistance obj) {
   return (obj == null) ? new HandleRef(null, IntPtr.Zero) : obj.swigCPtr;
 }
Exemplo n.º 9
0
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var posts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var language           = Language.English;
            var pipeline           = Pipeline.For(language);
            var postsWithDocuments = posts
                                     .Select(post =>
            {
                var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
                {
                    UID = post.Title.Hash128()
                };
                pipeline.ProcessSingle(document);
                return(Post: post, Document: document);
            })
                                     .ToArray(); // Call ToArray to force evaluation of the document processing now

            Console.WriteLine("Training FastText model..");
            var fastText = new FastText(language, version: 0, tag: "");

            fastText.Data.Type                  = FastText.ModelType.PVDM;
            fastText.Data.Loss                  = FastText.LossType.NegativeSampling;
            fastText.Data.IgnoreCase            = true;
            fastText.Data.Epoch                 = 50;
            fastText.Data.Dimensions            = 512;
            fastText.Data.MinimumCount          = 1;
            fastText.Data.ContextWindow         = 10;
            fastText.Data.NegativeSamplingCount = 20;
            fastText.Train(
                postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
                );

            Console.WriteLine("Training TF-IDF model..");
            var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
            await tfidf.Train(postsWithDocuments.Select(postWithDocument => postWithDocument.Document));

            Console.WriteLine("Getting average TF-IDF weights per word..");
            var tokenValueTFIDF = new Dictionary <string, List <float> >(StringComparer.OrdinalIgnoreCase);

            foreach (var doc in postsWithDocuments.Select(postWithDocument => postWithDocument.Document))
            {
                tfidf.Process(doc);
                foreach (var sentence in doc)
                {
                    foreach (var token in sentence)
                    {
                        if (!tokenValueTFIDF.TryGetValue(token.Value, out var freqs))
                        {
                            freqs = new();
                            tokenValueTFIDF.Add(token.Value, freqs);
                        }
                        freqs.Add(token.Frequency);
                    }
                }
            }
            var averagedTokenValueTFIDF = tokenValueTFIDF.ToDictionary(
                entry => entry.Key,
                entry => entry.Value.Average(), StringComparer.OrdinalIgnoreCase
                );

            Console.WriteLine("Building recommendations..");

            // Combine the blog post data with the FastText-generated vectors
            var results = fastText
                          .GetDocumentVectors()
                          .Select(result =>
            {
                // Each document vector instance will include a "token" string that may be mapped back to the
                // UID of the document for each blog post. If there were a large number of posts to deal with
                // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
                // have a 100+ and so a LINQ "First" scan over the list will suffice.
                var uid           = UID128.Parse(result.Token);
                var postForResult = postsWithDocuments.First(
                    postWithDocument => postWithDocument.Document.UID == uid
                    );
                return(UID: uid, result.Vector, postForResult.Post);
            })
                          .ToArray(); // ToArray since we enumerate multiple times below

            // Construct a graph to search over, as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = 15, LevelLambda = 1 / Math.Log(15) }
                );

            graph.AddItems(results);

            const int maximumNumberOfResultsToReturn = 3;
            var       postsWithSimilarResults        = results
                                                       .Select(result =>
            {
                // Request that the KNNSearch operate over all documents because we can't take the top {n}
                // until we've combined the ordering with the title TFIDF proximity values
                var similarResults = graph
                                     .KNNSearch(result, postsWithDocuments.Length)
                                     .Where(similarResult => similarResult.Item.UID != result.UID);

                var tokenValuesInTitle =
                    GetAllTokensForText(NormaliseSomeCommonTerms(result.Post.Title), pipeline)
                    .Select(token => token.Value)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);

                return(new
                {
                    result.Post,
                    Similar = similarResults
                              .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance,
                        ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                            NormaliseSomeCommonTerms(similarResult.Item.Post.Title),
                            tokenValuesInTitle,
                            averagedTokenValueTFIDF,
                            pipeline
                            )
                    })
                              .OrderByDescending(similarResult => similarResult.ProximityByTitleTFIDF)
                              .ThenBy(similarResult => similarResult.Distance)
                              .Take(maximumNumberOfResultsToReturn)
                              .ToArray()
                });
            })
                                                       .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
                                                       .ToArray();

            foreach (var postWithSimilarResults in postsWithSimilarResults)
            {
                Console.WriteLine();
                Console.WriteLine(postWithSimilarResults.Post.Title);
                foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }
Exemplo n.º 10
0
        private DistanceMeasure CreateMeasure(string processName, string dirName, DistanceMeasures measure, PDB.PDBMODE atoms, bool jury1d, string alignFileName,
                                              string profileName = null, string refJuryProfile = null)
        {
            DistanceMeasure dist = null;

            switch (measure)
            {
            case DistanceMeasures.HAMMING:
                if (alignFileName != null)
                {
                    dist = new JuryDistance(alignFileName, jury1d, profileName, refJuryProfile);
                }
                else
                {
                    dist = new JuryDistance(dirName, alignFileName, jury1d, profileName, refJuryProfile);
                }
                break;

            case DistanceMeasures.TANIMOTO:
                if (alignFileName != null)
                {
                    dist = new Tanimoto(alignFileName, jury1d, profileName, refJuryProfile);
                }
                else
                {
                    dist = new Tanimoto(dirName, alignFileName, jury1d, profileName, refJuryProfile);
                }
                break;

            case DistanceMeasures.COSINE:
                if (alignFileName != null)
                {
                    dist = new CosineDistance(alignFileName, jury1d, profileName, refJuryProfile);
                }
                else
                {
                    dist = new CosineDistance(dirName, alignFileName, jury1d, profileName, refJuryProfile);
                }
                break;

            case DistanceMeasures.RMSD:
                if (dirName == null)
                {
                    throw new Exception("RMSD and MAXSUB measures cannot be used for aligned profiles!");
                }
                dist = new Rmsd(dirName, alignFileName, jury1d, atoms, refJuryProfile);
                break;

            case DistanceMeasures.MAXSUB:
                if (dirName == null)
                {
                    throw new Exception("RMSD and MAXSUB measures cannot be used for aligned profiles!");
                }
                dist = new MaxSub(dirName, alignFileName, jury1d, refJuryProfile);
                break;

            case DistanceMeasures.GDT_TS:
                if (dirName == null)
                {
                    throw new Exception("RMSD and MAXSUB measures cannot be used for aligned profiles!");
                }
                dist = new GDT_TS(dirName, alignFileName, jury1d, refJuryProfile);
                break;
            }
            return(dist);
        }