Esempio n. 1
0
        /// <summary>
        /// Sample entry point: trains a multi-label FastText classifier on the Reuters corpus and prints
        /// F1 / precision / recall per score cutoff for both the train and the test set.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static async Task Main(string[] args)
        {
            Console.OutputEncoding = Encoding.UTF8;
            ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(builder => builder.AddConsole()));

            // Model storage: the online repository, backed by the local folder ./catalyst-models/
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

            // Fetch the Reuters corpus (downloaded if necessary)
            var (trainSet, testSet) = await Corpus.Reuters.GetAsync();

            // The corpus text is untokenized at this point, so run it through the English pipeline
            var englishPipeline = Pipeline.For(Language.English);
            var trainDocs       = englishPipeline.Process(trainSet).ToArray();
            var testDocs        = englishPipeline.Process(testSet).ToArray();

            // Supervised FastText classifier using a multi-label loss (OneVsAll)
            var classifier = new FastText(Language.English, 0, "Reuters-Classifier");

            classifier.Data.Type                    = FastText.ModelType.Supervised;
            classifier.Data.Loss                    = FastText.LossType.OneVsAll;
            classifier.Data.LearningRate            = 1f;
            classifier.Data.Dimensions              = 256;
            classifier.Data.Epoch                   = 100;
            classifier.Data.MinimumWordNgramsCounts = 5;
            classifier.Data.MaximumWordNgrams       = 3;
            classifier.Data.MinimumCount            = 5;

            classifier.Train(trainDocs);

            // The model can also be auto-tuned, using the algorithm from
            // https://ai.facebook.com/blog/fasttext-blog-post-open-source-in-brief/
            classifier.AutoTuneTrain(trainDocs, testDocs, new FastText.AutoTuneOptions());

            // Predictions for each document set, each timed by its own Measure scope
            var trainPredictions = PredictAll(trainDocs, "Computing train-set predictions");
            var testPredictions  = PredictAll(testDocs, "Computing test set predictions");

            var trainStatsByCutoff = ComputeStats(trainPredictions);
            var testStatsByCutoff  = ComputeStats(testPredictions);

            Console.WriteLine("\n\n\n--- Results ---\n\n\n");
            foreach (var pair in trainStatsByCutoff.Zip(testStatsByCutoff))
            {
                var trainStats = pair.First;
                var testStats  = pair.Second;
                Console.WriteLine($"\tScore cutoff: {trainStats.Cutoff:n2} Train: F1={trainStats.F1:n2} P={trainStats.Precision:n2} R={trainStats.Recall:n2} Test: F1={testStats.F1:n2} P={testStats.Precision:n2} R={testStats.Recall:n2}");
            }

            Console.ReadLine();

            // Runs the classifier over every document in parallel and maps each document to its prediction.
            Dictionary<IDocument, Dictionary<string, float>> PredictAll(IDocument[] docs, string measureMessage)
            {
                using (new Measure(Logger, measureMessage, docs.Length))
                {
                    return docs.AsParallel()
                               .Select(doc => (Doc: doc, Pred: classifier.Predict(doc)))
                               .ToDictionary(entry => entry.Doc, entry => entry.Pred);
                }
            }
        }
        /// <summary>
        /// Deletes the stored "language-detector" FastText model and the detector's own data.
        /// </summary>
        /// <param name="language">Not used; the model is always addressed with <c>Language.Any</c>.</param>
        /// <param name="version">Version of the detector and of its FastText model to delete.</param>
        /// <param name="tag">Not used; the FastText model tag is always "language-detector".</param>
        /// <returns><c>true</c> when at least one of the two delete operations reported a deletion.</returns>
        public new static async Task<bool> DeleteAsync(Language language, int version, string tag)
        {
            var detector = new FastTextLanguageDetector(version);

            // Both deletions always run; the results are only combined afterwards.
            var modelDeleted = await FastText.DeleteAsync(Language.Any, version, "language-detector");
            var dataDeleted  = await detector.DeleteDataAsync();

            return modelDeleted || dataDeleted;
        }
Esempio n. 3
0
        /// <summary>
        /// Deletes the two stored matrices ("-lhs" and "-rhs" tag suffixes) and the model data
        /// for the given language, version and tag.
        /// </summary>
        /// <param name="language">Language of the stored model.</param>
        /// <param name="version">Version of the stored model.</param>
        /// <param name="tag">Tag of the stored model; also the prefix of the matrix tags.</param>
        /// <returns><c>true</c> when at least one of the three delete operations reported a deletion.</returns>
        public new static async Task<bool> DeleteAsync(Language language, int version, string tag)
        {
            var model = new FastText(language, version, tag);

            // NOTE(review): the matrix entries are addressed by the StarSpaceModel name - confirm this is intended here.
            var matrixModelName = nameof(StarSpaceModel) + "-Matrix";

            // All three deletions always run; the results are only combined afterwards.
            var lhsDeleted  = await DataStore.DeleteAsync(language, matrixModelName, version, tag + "-lhs");
            var rhsDeleted  = await DataStore.DeleteAsync(language, matrixModelName, version, tag + "-rhs");
            var dataDeleted = await model.DeleteDataAsync();

            return lhsDeleted || rhsDeleted || dataDeleted;
        }
Esempio n. 4
0
        /// <summary>
        /// Trains a multi-label FastText classifier on the Reuters corpus and prints
        /// F1 / precision / recall per score cutoff for both the train and the test set.
        /// </summary>
        public static async Task TxtClassification()
        {
            // Model storage: the online repository, backed by the local folder "catalyst-models"
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var (trainSet, testSet) = await Corpus.Reuters.GetAsync();

            // Run the raw corpus text through the English pipeline
            var englishPipeline = Pipeline.For(Language.English);
            var trainDocs       = englishPipeline.Process(trainSet).ToArray();
            var testDocs        = englishPipeline.Process(testSet).ToArray();

            // Supervised FastText classifier using the OneVsAll loss
            var classifier = new FastText(Language.English, 0, "Reuters-Classifier");

            classifier.Data.Type                    = FastText.ModelType.Supervised;
            classifier.Data.Loss                    = FastText.LossType.OneVsAll;
            classifier.Data.LearningRate            = 1f;
            classifier.Data.Dimensions              = 256;
            classifier.Data.Epoch                   = 100;
            classifier.Data.MinimumWordNgramsCounts = 5;
            classifier.Data.MaximumWordNgrams       = 3;
            classifier.Data.MinimumCount            = 5;

            classifier.Train(trainDocs);

            classifier.AutoTuneTrain(trainDocs, testDocs, new FastText.AutoTuneOptions());

            // Predictions for each document set, each timed by its own Measure scope
            var trainPredictions = PredictAll(trainDocs, "Computing train-set predictions");
            var testPredictions  = PredictAll(testDocs, "Computing test set predictions");

            var trainStatsByCutoff = ComputeStats(trainPredictions);
            var testStatsByCutoff  = ComputeStats(testPredictions);

            Console.WriteLine("\n\n\n--- Results ---\n\n\n");
            foreach (var pair in trainStatsByCutoff.Zip(testStatsByCutoff))
            {
                var trainStats = pair.First;
                var testStats  = pair.Second;
                Console.WriteLine($"\tScore cutoff: {trainStats.Cutoff:n2} Train: F1={trainStats.F1:n2} P={trainStats.Precision:n2} R={trainStats.Recall:n2} Test: F1={testStats.F1:n2} P={testStats.Precision:n2} R={testStats.Recall:n2}");
            }

            Console.ReadLine();

            // Runs the classifier over every document in parallel and maps each document to its prediction.
            Dictionary<IDocument, Dictionary<string, float>> PredictAll(IDocument[] docs, string measureMessage)
            {
                using (new Measure(Logger, measureMessage, docs.Length))
                {
                    return docs.AsParallel()
                               .Select(doc => (Doc: doc, Pred: classifier.Predict(doc)))
                               .ToDictionary(entry => entry.Doc, entry => entry.Pred);
                }
            }
        }
        /// <summary>
        /// Creates a language detector for the given version, backed by a supervised FastText model
        /// tagged "language-detector" and a <c>SpaceTokenizer</c>.
        /// </summary>
        /// <param name="version">Version used for both the detector and its FastText model.</param>
        public FastTextLanguageDetector(int version)
            : base(Language.Any, version, nameof(FastTextLanguageDetector), compress: false)
        {
            Model = new FastText(Language.Any, version, "language-detector");

            // Configure the model settings (same values, same order of assignment).
            var settings = Model.Data;

            settings.Type               = FastText.ModelType.Supervised;
            settings.MaximumWordNgrams  = 0;    // no word n-grams
            settings.MinimumNgrams      = 2;
            settings.MaximumNgrams      = 5;
            settings.VectorQuantization = QuantizationType.None;
            settings.LearningRate       = 0.1f;
            settings.Epoch              = 50;
            settings.Dimensions         = 16;
            settings.IgnoreCase         = false;
            settings.Loss               = FastText.LossType.NegativeSampling;
            settings.MinimumCount       = 5;

            Tokenizer = new SpaceTokenizer();
        }
Esempio n. 6
0
        /// <summary>
        /// Computes <c>TranslationMatrix</c> from the vectors of word pairs that exist in both models.
        /// Word pairs come from the translation dictionary for the two model languages; every pair found in
        /// both models is also added to <c>Dictionary</c>.
        /// </summary>
        /// <param name="source">Model providing the source-language word vectors.</param>
        /// <param name="target">Model providing the target-language word vectors.</param>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="target"/> is null.</exception>
        /// <exception cref="ArgumentException">The two models do not have the same number of dimensions.</exception>
        public void ComputeAlignment(FastText source, FastText target)
        {
            if (source is null)
            {
                throw new ArgumentNullException(nameof(source));
            }
            if (target is null)
            {
                throw new ArgumentNullException(nameof(target));
            }

            int dim = source.Data.Dimensions;

            if (target.Data.Dimensions != dim)
            {
                // Message kept unchanged; only the exception type is more specific than System.Exception.
                throw new ArgumentException("Source and Target models must have the same dimensions!");
            }

            // Upper bound on the number of aligned word pairs; three times as many candidates are requested
            // because not every dictionary entry exists in both models.
            const int N = 10_000;
            var dict = TranslationDictionary.GetDictionary(source.Language, target.Language, N * 3);

            var A = new Matrix(dict.Count, dim);
            var B = new Matrix(dict.Count, dim);
            int k = 0;

            //find words that exist on both models
            foreach (var kv in dict)
            {
                if ((source.GetWordIndex(kv.Key) > -1) && (target.GetWordIndex(kv.Value) > -1))
                {
                    Dictionary.Add(kv.Key, kv.Value);

                    A[k] = source.GetVector(kv.Key, source.Language);
                    B[k] = target.GetVector(kv.Value, target.Language);
                    k++;
                }
                if (k == N)
                {
                    break;
                }
            }

            // Shrink both matrices to the k rows that were actually filled.
            A.ResizeAndFillRows(k, 0);
            B.ResizeAndFillRows(k, 0);

            var U = B.Transpose().Multiply(A);

            // The singular values are not needed, only v.
            CalculateSVD(U.ToArray(), out float[] _, out float[][] v);

            // NOTE(review): this matches "W = U dot V'" below only if CalculateSVD rewrites the array
            // returned by U.ToArray() in place and that array is U's own storage - confirm.
            TranslationMatrix = U.Multiply(new Matrix(v).Transpose());

            //M = A*B
            //U,V = SVD(M)
            //W = U dot V'
        }
        /// <summary>
        /// Loads a language detector of the given version from the store, together with its
        /// "language-detector" FastText model.
        /// </summary>
        /// <remarks>
        /// If the detector data is not found under its current tag, each former name registered for
        /// <c>FastTextLanguageDetector</c> is tried in turn. The detector's tag is always restored to the
        /// current one afterwards, whether or not any former name could be loaded. If nothing can be
        /// loaded, the missing data is ignored, as before.
        /// </remarks>
        /// <param name="language">Not used; the detector and its model always use <c>Language.Any</c>.</param>
        /// <param name="version">Version of the detector and of its FastText model.</param>
        /// <param name="tag">Not used; the FastText model tag is always "language-detector".</param>
        /// <returns>The detector, with <c>Model</c> set to the result of loading the FastText model.</returns>
        public new static async Task <FastTextLanguageDetector> FromStoreAsync(Language language, int version, string tag)
        {
            var a = new FastTextLanguageDetector(version);

            //Because we use the model name as the tag of this model, we've to check for formernames here
            try
            {
                await a.LoadDataAsync();
            }
            catch (FileNotFoundException)
            {
                if (ObjectStore.TryGetFormerNames(nameof(FastTextLanguageDetector), out var formerNames))
                {
                    var correctTag = a.Tag;
                    try
                    {
                        foreach (var formerName in formerNames)
                        {
                            try
                            {
                                a.Tag = formerName;
                                await a.LoadDataAsync();
                                break;
                            }
                            catch (FileNotFoundException)
                            {
                                //ignore - nothing stored under this former name, try the next one
                            }
                        }
                    }
                    finally
                    {
                        // Restore the tag even when every former name failed; otherwise the detector
                        // would keep the last former name as its tag.
                        a.Tag = correctTag;
                    }
                }
            }

            a.Model = await FastText.FromStoreAsync_Internal(Language.Any, version, "language-detector");

            a.Model?.CompactSupervisedModel();

            return(a);
        }
Esempio n. 8
0
        /// <summary>
        /// Console entry point: reads the blog posts, trains a FastText (PVDM) model over them and prints,
        /// for every post, up to three most similar other posts found through a graph KNN search over the
        /// document vectors.
        /// </summary>
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var blogPosts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var english = Language.English;
            var nlp     = Pipeline.For(english);

            // ToArray forces the document processing to happen right here rather than lazily later
            var parsedPosts = blogPosts
                .Select(blogPost =>
                {
                    var doc = new Document(NormaliseSomeCommonTerms(blogPost.PlainTextContent), english);
                    doc.UID = blogPost.Title.Hash128();
                    nlp.ProcessSingle(doc);
                    return (Post: blogPost, Document: doc);
                })
                .ToArray();

            Console.WriteLine("Training FastText model..");
            var model = new FastText(english, version: 0, tag: "");

            model.Data.Type                  = FastText.ModelType.PVDM;
            model.Data.Loss                  = FastText.LossType.NegativeSampling;
            model.Data.IgnoreCase            = true;
            model.Data.Epoch                 = 50;
            model.Data.Dimensions            = 512;
            model.Data.MinimumCount          = 1;
            model.Data.ContextWindow         = 10;
            model.Data.NegativeSamplingCount = 20;
            model.Train(
                parsedPosts.Select(parsed => parsed.Document),
                trainingStatus: progress => Console.WriteLine($" Progress: {progress.Progress}, Epoch: {progress.Epoch}"));

            Console.WriteLine("Building recommendations..");

            // Pair every FastText document vector with the blog post it was generated from. The vector's
            // "token" string parses back to the document UID; with only 100+ posts a linear "First" scan
            // is sufficient (a UID-to-post dictionary would be the choice for a large number of posts).
            // Materialised with ToArray because the sequence is enumerated more than once below.
            var results = model
                .GetDocumentVectors()
                .Select(docVector =>
                {
                    var uid     = UID128.Parse(docVector.Token);
                    var matched = parsedPosts.First(parsed => parsed.Document.UID == uid);
                    return (UID: uid, docVector.Vector, matched.Post);
                })
                .ToArray();

            // Search graph, built as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = 15, LevelLambda = 1 / Math.Log(15) });

            graph.AddItems(results);

            // KNNSearch per post to find its three most similar posts
            const int maxSimilarPosts = 3;

            var postsWithSimilarResults = results
                .Select(current => new
                {
                    current.Post,
                    // One extra result is requested because the post itself is expected to come back as
                    // the best match and gets filtered out; Take covers the case where it did not.
                    Similar = graph
                        .KNNSearch(current, maxSimilarPosts + 1)
                        .Where(neighbour => neighbour.Item.UID != current.UID)
                        .Take(maxSimilarPosts)
                        .Select(neighbour => new
                        {
                            neighbour.Item.Post,
                            neighbour.Distance
                        })
                        .ToArray()
                })
                .OrderBy(entry => entry.Post.Title, StringComparer.OrdinalIgnoreCase)
                .ToArray();

            foreach (var entry in postsWithSimilarResults)
            {
                Console.WriteLine();
                Console.WriteLine(entry.Post.Title);
                foreach (var similar in entry.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similar.Distance:0.000} {similar.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }
        /// <summary>
        /// Console entry point: reads the blog posts, trains a FastText (PVDM) model and a TF-IDF model over
        /// them and prints, for every post, up to three related posts. Candidates are ranked first by the
        /// TF-IDF proximity of their titles (descending) and then by vector distance (ascending).
        /// </summary>
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var blogPosts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var english = Language.English;
            var nlp     = Pipeline.For(english);

            // ToArray forces the document processing to happen right here rather than lazily later
            var parsedPosts = blogPosts
                .Select(blogPost =>
                {
                    var doc = new Document(NormaliseSomeCommonTerms(blogPost.PlainTextContent), english);
                    doc.UID = blogPost.Title.Hash128();
                    nlp.ProcessSingle(doc);
                    return (Post: blogPost, Document: doc);
                })
                .ToArray();

            Console.WriteLine("Training FastText model..");
            var model = new FastText(english, version: 0, tag: "");

            model.Data.Type                  = FastText.ModelType.PVDM;
            model.Data.Loss                  = FastText.LossType.NegativeSampling;
            model.Data.IgnoreCase            = true;
            model.Data.Epoch                 = 50;
            model.Data.Dimensions            = 512;
            model.Data.MinimumCount          = 1;
            model.Data.ContextWindow         = 10;
            model.Data.NegativeSamplingCount = 20;
            model.Train(
                parsedPosts.Select(parsed => parsed.Document),
                trainingStatus: progress => Console.WriteLine($" Progress: {progress.Progress}, Epoch: {progress.Epoch}"));

            Console.WriteLine("Training TF-IDF model..");
            var tfidf = new TFIDF(nlp.Language, version: 0, tag: "");
            await tfidf.Train(parsedPosts.Select(parsed => parsed.Document));

            Console.WriteLine("Getting average TF-IDF weights per word..");
            var frequenciesByTokenValue = new Dictionary<string, List<float>>(StringComparer.OrdinalIgnoreCase);

            foreach (var parsed in parsedPosts)
            {
                var doc = parsed.Document;

                // Token frequencies are read only after the TF-IDF model has processed the document
                tfidf.Process(doc);
                foreach (var sentence in doc)
                {
                    foreach (var token in sentence)
                    {
                        if (!frequenciesByTokenValue.TryGetValue(token.Value, out var frequencies))
                        {
                            frequencies = new List<float>();
                            frequenciesByTokenValue.Add(token.Value, frequencies);
                        }
                        frequencies.Add(token.Frequency);
                    }
                }
            }

            var averagedTokenValueTFIDF = frequenciesByTokenValue.ToDictionary(
                entry => entry.Key,
                entry => entry.Value.Average(),
                StringComparer.OrdinalIgnoreCase);

            Console.WriteLine("Building recommendations..");

            // Pair every FastText document vector with the blog post it was generated from. The vector's
            // "token" string parses back to the document UID; with only 100+ posts a linear "First" scan
            // is sufficient (a UID-to-post dictionary would be the choice for a large number of posts).
            // Materialised with ToArray because the sequence is enumerated more than once below.
            var results = model
                .GetDocumentVectors()
                .Select(docVector =>
                {
                    var uid     = UID128.Parse(docVector.Token);
                    var matched = parsedPosts.First(candidate => candidate.Document.UID == uid);
                    return (UID: uid, docVector.Vector, matched.Post);
                })
                .ToArray();

            // Search graph, built as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = 15, LevelLambda = 1 / Math.Log(15) });

            graph.AddItems(results);

            const int maxSimilarPosts = 3;

            var postsWithSimilarResults = results
                .Select(current =>
                {
                    // The search covers all documents: the top {n} can only be taken once the ordering
                    // has been combined with the title TF-IDF proximity values
                    var neighbours = graph
                        .KNNSearch(current, parsedPosts.Length)
                        .Where(neighbour => neighbour.Item.UID != current.UID);

                    var titleTokens = GetAllTokensForText(NormaliseSomeCommonTerms(current.Post.Title), nlp);
                    var tokenValuesInTitle = titleTokens
                        .Select(token => token.Value)
                        .ToHashSet(StringComparer.OrdinalIgnoreCase);

                    var ranked = neighbours
                        .Select(neighbour => new
                        {
                            neighbour.Item.Post,
                            neighbour.Distance,
                            ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                                NormaliseSomeCommonTerms(neighbour.Item.Post.Title),
                                tokenValuesInTitle,
                                averagedTokenValueTFIDF,
                                nlp)
                        })
                        .OrderByDescending(candidate => candidate.ProximityByTitleTFIDF)
                        .ThenBy(candidate => candidate.Distance)
                        .Take(maxSimilarPosts)
                        .ToArray();

                    return new
                    {
                        current.Post,
                        Similar = ranked
                    };
                })
                .OrderBy(entry => entry.Post.Title, StringComparer.OrdinalIgnoreCase)
                .ToArray();

            foreach (var entry in postsWithSimilarResults)
            {
                Console.WriteLine();
                Console.WriteLine(entry.Post.Title);
                foreach (var similar in entry.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similar.ProximityByTitleTFIDF:0.000} {similar.Distance:0.000} {similar.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }