Exemplo n.º 1
0
        /// <summary>
        /// Reads the blog posts, trains a FastText model over their parsed documents and then writes, for each
        /// post, up to three other posts found by a nearest-neighbour search over the document vectors.
        /// </summary>
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var blogPosts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var english = Language.English;
            var nlp = Pipeline.For(english);

            // Materialised immediately (ToArray) so that every document has been processed before training starts
            var parsedPosts = blogPosts
                .Select(blogPost =>
                {
                    var normalisedText = NormaliseSomeCommonTerms(blogPost.PlainTextContent);
                    var parsed = new Document(normalisedText, english) { UID = blogPost.Title.Hash128() };
                    nlp.ProcessSingle(parsed);
                    return (Post: blogPost, Document: parsed);
                })
                .ToArray();

            Console.WriteLine("Training FastText model..");
            var model = new FastText(english, version: 0, tag: "");
            model.Data.Type = FastText.ModelType.PVDM;
            model.Data.Loss = FastText.LossType.NegativeSampling;
            model.Data.IgnoreCase = true;
            model.Data.Epoch = 50;
            model.Data.Dimensions = 512;
            model.Data.MinimumCount = 1;
            model.Data.ContextWindow = 10;
            model.Data.NegativeSamplingCount = 20;

            var trainingDocuments = parsedPosts.Select(entry => entry.Document);
            model.Train(
                trainingDocuments,
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}"));

            Console.WriteLine("Building recommendations..");

            // Pair each FastText document vector with its blog post. The vector's Token is parsed back into a
            // UID128 and matched against the document UIDs assigned above. This is a linear "First" scan per
            // vector, which is acceptable for the 100+ posts involved; a dictionary lookup would be the better
            // choice for a much larger collection.
            var vectorsWithPosts = model
                .GetDocumentVectors()
                .Select(documentVector =>
                {
                    var uid = UID128.Parse(documentVector.Token);
                    var match = parsedPosts.First(entry => entry.Document.UID == uid);
                    return (UID: uid, Vector: documentVector.Vector, Post: match.Post);
                })
                .ToArray(); // Enumerated more than once below (graph population and searching)

            // Build the graph to search over, following
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            const int m = 15; // The same value feeds both M and the LevelLambda calculation
            var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (target, source) => CosineDistance.NonOptimized(source.Vector, target.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = m, LevelLambda = 1 / Math.Log(m) });
            graph.AddItems(vectorsWithPosts);

            // One extra result is requested from each KNNSearch because the post being searched for is expected
            // to come back as its own best match and is filtered out. The Take covers the case where it did not
            // come back and so nothing was removed.
            const int maximumNumberOfResultsToReturn = 3;
            var recommendations = vectorsWithPosts
                .Select(current => new
                {
                    current.Post,
                    Similar = graph
                        .KNNSearch(current, maximumNumberOfResultsToReturn + 1)
                        .Where(candidate => candidate.Item.UID != current.UID)
                        .Take(maximumNumberOfResultsToReturn)
                        .Select(candidate => new { candidate.Item.Post, candidate.Distance })
                        .ToArray()
                })
                .OrderBy(entry => entry.Post.Title, StringComparer.OrdinalIgnoreCase)
                .ToArray();

            // Report each post (ordered by title, ignoring case) followed by its matches, closest first
            foreach (var recommendation in recommendations)
            {
                Console.WriteLine();
                Console.WriteLine(recommendation.Post.Title);
                foreach (var similarResult in recommendation.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the blog posts, trains FastText and TF-IDF models over their parsed documents and then writes,
        /// for each post, up to three other posts chosen by title TF-IDF proximity first and vector distance
        /// second.
        /// </summary>
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var blogPosts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var english = Language.English;
            var pipeline = Pipeline.For(english);

            // Materialised immediately (ToArray) so that every document has been processed before training starts
            var postsWithDocuments = blogPosts
                .Select(blogPost =>
                {
                    var normalisedText = NormaliseSomeCommonTerms(blogPost.PlainTextContent);
                    var parsed = new Document(normalisedText, english) { UID = blogPost.Title.Hash128() };
                    pipeline.ProcessSingle(parsed);
                    return (Post: blogPost, Document: parsed);
                })
                .ToArray();

            Console.WriteLine("Training FastText model..");
            var model = new FastText(english, version: 0, tag: "");
            model.Data.Type = FastText.ModelType.PVDM;
            model.Data.Loss = FastText.LossType.NegativeSampling;
            model.Data.IgnoreCase = true;
            model.Data.Epoch = 50;
            model.Data.Dimensions = 512;
            model.Data.MinimumCount = 1;
            model.Data.ContextWindow = 10;
            model.Data.NegativeSamplingCount = 20;

            var fastTextTrainingDocuments = postsWithDocuments.Select(entry => entry.Document);
            model.Train(
                fastTextTrainingDocuments,
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}"));

            Console.WriteLine("Training TF-IDF model..");
            var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
            await tfidf.Train(postsWithDocuments.Select(entry => entry.Document));

            Console.WriteLine("Getting average TF-IDF weights per word..");

            // Gather every Frequency value seen for each token value (keys compared ignoring case) after the
            // TF-IDF model has processed the document
            var frequenciesByTokenValue = new Dictionary<string, List<float>>(StringComparer.OrdinalIgnoreCase);
            foreach (var entry in postsWithDocuments)
            {
                var doc = entry.Document;
                tfidf.Process(doc);
                foreach (var sentence in doc)
                {
                    foreach (var token in sentence)
                    {
                        if (frequenciesByTokenValue.TryGetValue(token.Value, out var observed))
                        {
                            observed.Add(token.Frequency);
                        }
                        else
                        {
                            frequenciesByTokenValue.Add(token.Value, new List<float> { token.Frequency });
                        }
                    }
                }
            }

            // Reduce each token's list of values to its mean
            var averagedTokenValueTFIDF = new Dictionary<string, float>(StringComparer.OrdinalIgnoreCase);
            foreach (var tokenEntry in frequenciesByTokenValue)
            {
                averagedTokenValueTFIDF.Add(tokenEntry.Key, tokenEntry.Value.Average());
            }

            Console.WriteLine("Building recommendations..");

            // Pair each FastText document vector with its blog post. The vector's Token is parsed back into a
            // UID128 and matched against the document UIDs assigned above. This is a linear "First" scan per
            // vector, which is acceptable for the 100+ posts involved; a dictionary lookup would be the better
            // choice for a much larger collection.
            var vectorsWithPosts = model
                .GetDocumentVectors()
                .Select(documentVector =>
                {
                    var uid = UID128.Parse(documentVector.Token);
                    var match = postsWithDocuments.First(entry => entry.Document.UID == uid);
                    return (UID: uid, Vector: documentVector.Vector, Post: match.Post);
                })
                .ToArray(); // Enumerated more than once below (graph population and searching)

            // Build the graph to search over, following
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            const int m = 15; // The same value feeds both M and the LevelLambda calculation
            var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (target, source) => CosineDistance.NonOptimized(source.Vector, target.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = m, LevelLambda = 1 / Math.Log(m) });
            graph.AddItems(vectorsWithPosts);

            const int maximumNumberOfResultsToReturn = 3;
            var recommendations = vectorsWithPosts
                .Select(current =>
                {
                    // The search is asked for as many results as there are documents because the top {n} can't
                    // be taken until the title TF-IDF proximity has been combined into the ordering
                    var otherPosts = graph
                        .KNNSearch(current, postsWithDocuments.Length)
                        .Where(candidate => candidate.Item.UID != current.UID);

                    var normalisedTitle = NormaliseSomeCommonTerms(current.Post.Title);
                    var tokenValuesInTitle = GetAllTokensForText(normalisedTitle, pipeline)
                        .Select(token => token.Value)
                        .ToHashSet(StringComparer.OrdinalIgnoreCase);

                    // Highest title proximity wins, with vector distance (smallest first) as the tie-breaker
                    var ranked = otherPosts
                        .Select(candidate => new
                        {
                            candidate.Item.Post,
                            candidate.Distance,
                            ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                                NormaliseSomeCommonTerms(candidate.Item.Post.Title),
                                tokenValuesInTitle,
                                averagedTokenValueTFIDF,
                                pipeline)
                        })
                        .OrderByDescending(candidate => candidate.ProximityByTitleTFIDF)
                        .ThenBy(candidate => candidate.Distance)
                        .Take(maximumNumberOfResultsToReturn)
                        .ToArray();

                    return new { current.Post, Similar = ranked };
                })
                .OrderBy(entry => entry.Post.Title, StringComparer.OrdinalIgnoreCase)
                .ToArray();

            // Report each post (ordered by title, ignoring case) followed by its selected matches. Note that the
            // matches are written in Distance order here, not in the proximity order used to select them.
            foreach (var recommendation in recommendations)
            {
                Console.WriteLine();
                Console.WriteLine(recommendation.Post.Title);
                foreach (var similarResult in recommendation.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }