コード例 #1
ファイル: Program.cs プロジェクト: uzbekdev1/catalyst
        /// <summary>
        /// Sample entry point: fetches the Reuters corpus, processes the train and test documents with the
        /// English pipeline, configures a supervised FastText classifier, trains it through
        /// <c>AutoTuneTrain</c>, predicts on both sets and prints F1 / precision / recall per score cutoff.
        /// </summary>
        /// <param name="args">Command-line arguments; not read in the visible body.</param>
        static async Task Main(string[] args)
            // UTF-8 console output so non-ASCII text is printed correctly
            Console.OutputEncoding = Encoding.UTF8;
            ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(lb => lb.AddConsole()));

            //Configures the model storage to use the online repository backed by the local folder ./catalyst-models/
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

            //Download the Reuters corpus if necessary
            var(train, test) = await Corpus.Reuters.GetAsync();

            //Parse the documents using the English pipeline, as the text data is untokenized so far
            var nlp = Pipeline.For(Language.English);

            // ToArray materializes the processed documents once; both arrays are reused for training and prediction below
            var trainDocs = nlp.Process(train).ToArray();
            var testDocs  = nlp.Process(test).ToArray();

            //Train a FastText supervised classifier with a multi-label loss (OneVsAll)
            var fastText = new FastText(Language.English, 0, "Reuters-Classifier");

            // Hyper-parameters set before training
            fastText.Data.Type                    = FastText.ModelType.Supervised;
            fastText.Data.Loss                    = FastText.LossType.OneVsAll;
            fastText.Data.LearningRate            = 1f;
            fastText.Data.Dimensions              = 256;
            fastText.Data.Epoch                   = 100;
            fastText.Data.MinimumWordNgramsCounts = 5;
            fastText.Data.MaximumWordNgrams       = 3;
            fastText.Data.MinimumCount            = 5;


            //You can also auto-tune the model using the algorithm from https://ai.facebook.com/blog/fasttext-blog-post-open-source-in-brief/
            // NOTE(review): AutoTuneTrain is the only training call visible in this excerpt; no plain Train call is shown.
            // Whether auto-tuning keeps or overrides the values assigned above cannot be told from here - confirm.
            fastText.AutoTuneTrain(trainDocs, testDocs, new FastText.AutoTuneOptions());

            //Compute predictions
            // One entry per document; the inner dictionary is whatever fastText.Predict returns
            // (presumably label -> score - verify against FastText.Predict)
            Dictionary <IDocument, Dictionary <string, float> > predTrain, predTest;

            // Each Measure instance is disposed as soon as the single statement under its using completes
            using (new Measure(Logger, "Computing train-set predictions", trainDocs.Length))
                predTrain = trainDocs.AsParallel().Select(d => (Doc: d, Pred: fastText.Predict(d))).ToDictionary(d => d.Doc, d => d.Pred);

            using (new Measure(Logger, "Computing test set predictions", testDocs.Length))
                predTest = testDocs.AsParallel().Select(d => (Doc: d, Pred: fastText.Predict(d))).ToDictionary(d => d.Doc, d => d.Pred);

            var resultsTrain = ComputeStats(predTrain);
            var resultsTest  = ComputeStats(predTest);

            Console.WriteLine("\n\n\n--- Results ---\n\n\n");
            // Zip pairs the train and test results by position; only the train-side Cutoff is printed
            // (assumes both sequences use the same cutoffs in the same order - TODO confirm in ComputeStats)
            foreach (var res in resultsTrain.Zip(resultsTest))
                Console.WriteLine($"\tScore cutoff: {res.First.Cutoff:n2} Train: F1={res.First.F1:n2} P={res.First.Precision:n2} R={res.First.Recall:n2} Test: F1={res.Second.F1:n2} P={res.Second.Precision:n2} R={res.Second.Recall:n2}");

コード例 #2
        /// <summary>
        /// Deletes the stored data of the language detector for the given version: first the FastText model
        /// stored under <c>Language.Any</c> with the tag "language-detector", then the detector's own data.
        /// </summary>
        /// <param name="language">Not read in the visible body; the FastText model is always addressed with <c>Language.Any</c>.</param>
        /// <param name="version">Version of the detector and of its FastText model to delete.</param>
        /// <param name="tag">Not read in the visible body; the FastText tag is hard-coded.</param>
        /// <returns>
        /// NOTE(review): the return statement is not visible in this excerpt; presumably <c>deleted</c>,
        /// which becomes true as soon as either delete call reports true.
        /// </returns>
        public new static async Task <bool> DeleteAsync(Language language, int version, string tag)
            // Instance created only to reach DeleteDataAsync for this version
            var  a       = new FastTextLanguageDetector(version);
            bool deleted = false;

            // |= accumulates the results, so both deletions are always attempted
            deleted |= await FastText.DeleteAsync(Language.Any, version, "language-detector");

            deleted |= await a.DeleteDataAsync();

コード例 #3
        /// <summary>
        /// Deletes the stored data for the model identified by language, version and tag: two matrix
        /// entries in <c>DataStore</c> (tag suffixes "-lhs" and "-rhs") and the model's own data.
        /// </summary>
        /// <param name="language">Language of the model to delete.</param>
        /// <param name="version">Version of the model to delete.</param>
        /// <param name="tag">Tag of the model; also used as the prefix of the two matrix tags.</param>
        /// <returns>
        /// NOTE(review): the return statement is not visible in this excerpt; presumably <c>deleted</c>,
        /// which becomes true as soon as any of the three delete calls reports true.
        /// </returns>
        public new static async Task <bool> DeleteAsync(Language language, int version, string tag)
            // Instance created only to reach DeleteDataAsync for this language/version/tag
            var  a       = new FastText(language, version, tag);
            bool deleted = false;

            // NOTE(review): the store key is built from nameof(StarSpaceModel) although this method constructs a
            // FastText - confirm that this key (and the "-lhs"/"-rhs" suffixes) match what the model writes when stored.
            deleted |= await DataStore.DeleteAsync(language, nameof(StarSpaceModel) + "-Matrix", version, tag + "-lhs");

            deleted |= await DataStore.DeleteAsync(language, nameof(StarSpaceModel) + "-Matrix", version, tag + "-rhs");

            // |= accumulates the results, so all three deletions are always attempted
            deleted |= await a.DeleteDataAsync();

コード例 #5
        /// <summary>
        /// Creates a language detector for the given version. The base type is initialized with
        /// <c>Language.Any</c>, the class name as tag and <c>compress: false</c>; the inner FastText model
        /// uses the tag "language-detector" and is configured as a supervised model.
        /// </summary>
        /// <param name="version">Version passed to both the base constructor and the inner FastText model.</param>
        public FastTextLanguageDetector(int version) : base(Language.Any, version, nameof(FastTextLanguageDetector), compress: false)
            // Same language/version/tag triple that DeleteAsync and FromStoreAsync use for the inner model
            Model           = new FastText(Language.Any, version, "language-detector");
            Model.Data.Type = FastText.ModelType.Supervised;
            // presumably disables word n-grams so that only the 2..5 n-grams below are used - TODO confirm
            Model.Data.MaximumWordNgrams  = 0;
            Model.Data.MinimumNgrams      = 2;
            Model.Data.MaximumNgrams      = 5;
            Model.Data.VectorQuantization = QuantizationType.None;
            Model.Data.LearningRate       = 0.1f;
            Model.Data.Epoch        = 50;
            Model.Data.Dimensions   = 16;
            Model.Data.IgnoreCase   = false;
            Model.Data.Loss         = FastText.LossType.NegativeSampling;
            Model.Data.MinimumCount = 5;

            Tokenizer = new SpaceTokenizer();
コード例 #6
        /// <summary>
        /// Computes <c>TranslationMatrix</c> from two FastText models: collects the vectors of word pairs
        /// from a translation dictionary that exist in both models, multiplies the transposed target matrix
        /// by the source matrix, runs an SVD on the product and combines the results.
        /// Every accepted word pair is also added to <c>Dictionary</c>.
        /// </summary>
        /// <param name="source">Model providing the source-language vectors.</param>
        /// <param name="target">Model providing the target-language vectors; must have the same dimensions as <paramref name="source"/>.</param>
        /// <exception cref="Exception">Thrown when the two models have different vector dimensions.</exception>
        public void ComputeAlignment(FastText source, FastText target)
            //find words that exist on both models
            int dim = source.Data.Dimensions;

            if (target.Data.Dimensions != dim)
                throw new Exception("Source and Target models must have the same dimensions!");
            // N is compared against the row counter k below; the dictionary is requested with N * 3
            // (presumably a maximum number of entries, oversized because not every pair exists in both models - TODO confirm)
            int N    = 10_000;
            var dict = TranslationDictionary.GetDictionary(source.Language, target.Language, N * 3);

            // One row per dictionary entry is allocated up front; k is the index of the next row to fill
            var A = new Matrix(dict.Count, dim);
            var B = new Matrix(dict.Count, dim);
            int k = 0;

            foreach (var kv in dict)
                // Keep only pairs whose words are known to both models (word index > -1)
                if ((source.GetWordIndex(kv.Key) > -1) && (target.GetWordIndex(kv.Value) > -1))
                    Dictionary.Add(kv.Key, kv.Value);

                    A[k] = source.GetVector(kv.Key, source.Language);
                    B[k] = target.GetVector(kv.Value, target.Language);
                // NOTE(review): the increment of k and the statement guarded by this condition are not visible in
                // this excerpt; presumably k++ after filling the row and a break once N pairs were collected.
                if (k == N)

            // Called with the number of filled rows (presumably shrinks both matrices to k rows - TODO confirm)
            A.ResizeAndFillRows(k, 0);
            B.ResizeAndFillRows(k, 0);

            // Product of transposed B and A
            var U = B.Transpose().Multiply(A);

            // w is not read in the visible code.
            // NOTE(review): U is still used after this call; whether CalculateSVD overwrites the array returned by
            // U.ToArray() in place (and whether that array is U's backing storage) cannot be told from here - confirm.
            CalculateSVD(U.ToArray(), out float[] w, out float[][] v);

            TranslationMatrix = U.Multiply(new Matrix(v).Transpose());

            // NOTE(review): the summary below says M = A*B, while the code above multiplies B transposed by A - confirm.
            //M = A*B
            //U,V = SVD(M)
            //W = U dot V'
コード例 #7
        /// <summary>
        /// Creates a language detector for the given version and loads it from the store: first the
        /// detector's own data (retrying under former names when the file is not found), then the inner
        /// FastText model stored under <c>Language.Any</c> with the tag "language-detector".
        /// </summary>
        /// <param name="language">Not read in the visible body.</param>
        /// <param name="version">Version of the detector and of its FastText model to load.</param>
        /// <param name="tag">Not read in the visible body.</param>
        /// <returns>NOTE(review): the return statement is not visible in this excerpt; presumably the loaded detector <c>a</c>.</returns>
        public new static async Task <FastTextLanguageDetector> FromStoreAsync(Language language, int version, string tag)
            var a = new FastTextLanguageDetector(version);

            //Because the model name is used as the tag of this model, we have to check for former names here
            // NOTE(review): the try keywords matching the two catch clauses below are not visible in this excerpt.
                await a.LoadDataAsync();
            catch (FileNotFoundException)
                if (ObjectStore.TryGetFormerNames(nameof(FastTextLanguageDetector), out var formerNames))
                    // Remember the current tag so it can be restored after loading under a former name
                    var correctTag = a.Tag;
                    foreach (var formerName in formerNames)
                            a.Tag = formerName;
                            await a.LoadDataAsync();

                            a.Tag = correctTag;
                        // NOTE(review): the body of this catch and any loop exit after a successful load are not
                        // visible here; if LoadDataAsync throws, a.Tag still holds formerName at this point - confirm it is restored.
                        catch (FileNotFoundException)

            a.Model = await FastText.FromStoreAsync_Internal(Language.Any, version, "language-detector");


コード例 #8
        /// <summary>
        /// Sample entry point: reads blog posts, wraps each in a Document keyed by a hash of its title,
        /// configures a PVDM FastText model, maps the resulting document vectors back to their posts,
        /// and uses a SmallWorld graph with cosine distance to list the three most similar posts per post.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces and several call heads (marked inline below);
        /// the notes describe only what the visible lines establish.
        /// </remarks>
        private static async Task Main()
            Console.WriteLine("Reading posts from GitHub repo..");
            var posts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var language           = Language.English;
            // NOTE(review): pipeline is not used in the visible lines of this method; presumably it processes
            // each document inside the Select below - confirm.
            var pipeline           = Pipeline.For(language);
            var postsWithDocuments = posts
                                     .Select(post =>
                var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
                    // The UID is derived from the post title; it is matched against the parsed vector tokens further down
                    UID = post.Title.Hash128()
                return(Post: post, Document: document);
                                     .ToArray(); // Call ToArray to force evaluation of the document processing now

            Console.WriteLine("Training FastText model..");
            var fastText = new FastText(language, version: 0, tag: "");

            fastText.Data.Type                  = FastText.ModelType.PVDM;
            fastText.Data.Loss                  = FastText.LossType.NegativeSampling;
            fastText.Data.IgnoreCase            = true;
            fastText.Data.Epoch                 = 50;
            fastText.Data.Dimensions            = 512;
            fastText.Data.MinimumCount          = 1;
            fastText.Data.ContextWindow         = 10;
            fastText.Data.NegativeSamplingCount = 20;
            // NOTE(review): the head of the call that receives the two arguments below is not visible
            // (presumably fastText.Train(...)).
                postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")

            Console.WriteLine("Building recommendations..");

            // Combine the blog post data with the FastText-generated vectors
            // NOTE(review): the call between fastText and .Select that yields the document vectors is not visible here.
            var results = fastText
                          .Select(result =>
                // Each document vector instance will include a "token" string that may be mapped back to the
                // UID of the document for each blog post. If there were a large number of posts to deal with
                // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
                // have 100+ and so a LINQ "First" scan over the list will suffice.
                var uid           = UID128.Parse(result.Token);
                var postForResult = postsWithDocuments.First(
                    postWithDocument => postWithDocument.Document.UID == uid
                return(UID: uid, result.Vector, postForResult.Post);
                          .ToArray(); // ToArray since we enumerate multiple times below

            // Construct a graph to search over, as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            // NOTE(review): the remaining constructor arguments, the closing parenthesis and the step that adds
            // the results to the graph are not visible in this excerpt.
            var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                new() { M = 15, LevelLambda = 1 / Math.Log(15) }


            // For every post, use the "KNNSearch" method on the graph to find the three most similar posts
            const int maximumNumberOfResultsToReturn = 3;
            var       postsWithSimilarResults        = results
                                                       .Select(result =>
                // Request one result too many from the KNNSearch call because it's expected that the original
                // post will come back as the best match and we'll want to exclude that
                var similarResults = graph
                                     .KNNSearch(result, maximumNumberOfResultsToReturn + 1)
                                     .Where(similarResult => similarResult.Item.UID != result.UID)
                                     .Take(maximumNumberOfResultsToReturn); // Just in case the original post wasn't included

                // NOTE(review): the anonymous-object construction around the next lines is truncated; the loop
                // below reads Similar on each element, and Distance / Post.Title on each similar entry.
                    Similar = similarResults
                              .Select(similarResult => new
                                                       .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)

            // Similar entries are printed closest first
            // NOTE(review): no line printing the title of the post itself is visible in this excerpt.
            foreach (var postWithSimilarResults in postsWithSimilarResults)
                foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
                    Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}");

            Console.WriteLine("Done! Press [Enter] to terminate..");
コード例 #9
        /// <summary>
        /// Sample entry point: reads blog posts, wraps each in a Document keyed by a hash of its title,
        /// configures a PVDM FastText model and a TF-IDF model, averages per-token TF-IDF values, and ranks
        /// the posts found through a SmallWorld graph search by a title TF-IDF proximity value and by
        /// vector distance before printing them.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces and several call heads (marked inline below);
        /// the notes describe only what the visible lines establish.
        /// </remarks>
        private static async Task Main()
            Console.WriteLine("Reading posts from GitHub repo..");
            var posts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var language           = Language.English;
            // pipeline is used below for the TF-IDF model's language and for tokenizing post titles
            var pipeline           = Pipeline.For(language);
            var postsWithDocuments = posts
                                     .Select(post =>
                var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
                    // The UID is derived from the post title; it is matched against the parsed vector tokens further down
                    UID = post.Title.Hash128()
                return(Post: post, Document: document);
                                     .ToArray(); // Call ToArray to force evaluation of the document processing now

            Console.WriteLine("Training FastText model..");
            var fastText = new FastText(language, version: 0, tag: "");

            fastText.Data.Type                  = FastText.ModelType.PVDM;
            fastText.Data.Loss                  = FastText.LossType.NegativeSampling;
            fastText.Data.IgnoreCase            = true;
            fastText.Data.Epoch                 = 50;
            fastText.Data.Dimensions            = 512;
            fastText.Data.MinimumCount          = 1;
            fastText.Data.ContextWindow         = 10;
            fastText.Data.NegativeSamplingCount = 20;
            // NOTE(review): the head of the call that receives the two arguments below is not visible
            // (presumably fastText.Train(...)).
                postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")

            Console.WriteLine("Training TF-IDF model..");
            var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
            await tfidf.Train(postsWithDocuments.Select(postWithDocument => postWithDocument.Document));

            Console.WriteLine("Getting average TF-IDF weights per word..");
            // Token text -> list of values, keyed case-insensitively
            var tokenValueTFIDF = new Dictionary <string, List <float> >(StringComparer.OrdinalIgnoreCase);

            foreach (var doc in postsWithDocuments.Select(postWithDocument => postWithDocument.Document))
                foreach (var sentence in doc)
                    foreach (var token in sentence)
                        // First occurrence of a token value: register an empty list for it
                        // NOTE(review): the statement that appends a value to freqs is not visible in this
                        // excerpt; Average() below throws on an empty list, so one must exist - confirm.
                        if (!tokenValueTFIDF.TryGetValue(token.Value, out var freqs))
                            freqs = new();
                            tokenValueTFIDF.Add(token.Value, freqs);
            // Collapse each token's list into its mean, keeping the case-insensitive key comparison
            var averagedTokenValueTFIDF = tokenValueTFIDF.ToDictionary(
                entry => entry.Key,
                entry => entry.Value.Average(), StringComparer.OrdinalIgnoreCase

            Console.WriteLine("Building recommendations..");

            // Combine the blog post data with the FastText-generated vectors
            // NOTE(review): the call between fastText and .Select that yields the document vectors is not visible here.
            var results = fastText
                          .Select(result =>
                // Each document vector instance will include a "token" string that may be mapped back to the
                // UID of the document for each blog post. If there were a large number of posts to deal with
                // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
                // have 100+ and so a LINQ "First" scan over the list will suffice.
                var uid           = UID128.Parse(result.Token);
                var postForResult = postsWithDocuments.First(
                    postWithDocument => postWithDocument.Document.UID == uid
                return(UID: uid, result.Vector, postForResult.Post);
                          .ToArray(); // ToArray since we enumerate multiple times below

            // Construct a graph to search over, as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            // NOTE(review): the remaining constructor arguments, the closing parenthesis and the step that adds
            // the results to the graph are not visible in this excerpt.
            var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                new() { M = 15, LevelLambda = 1 / Math.Log(15) }


            // NOTE(review): maximumNumberOfResultsToReturn is not used in the visible lines below; presumably a
            // Take(...) on the ordered similar results was lost - confirm.
            const int maximumNumberOfResultsToReturn = 3;
            var       postsWithSimilarResults        = results
                                                       .Select(result =>
                // Request that the KNNSearch operate over all documents because we can't take the top {n}
                // until we've combined the ordering with the title TFIDF proximity values
                var similarResults = graph
                                     .KNNSearch(result, postsWithDocuments.Length)
                                     .Where(similarResult => similarResult.Item.UID != result.UID);

                // Token values of the (normalised) title of the current post
                // NOTE(review): the end of this statement is not visible in this excerpt.
                var tokenValuesInTitle =
                    GetAllTokensForText(NormaliseSomeCommonTerms(result.Post.Title), pipeline)
                    .Select(token => token.Value)

                // NOTE(review): the anonymous-object construction and the arguments of GetProximityByTitleTFIDF
                // are truncated; the loop below reads ProximityByTitleTFIDF, Distance and Post.Title per entry.
                    Similar = similarResults
                              .Select(similarResult => new
                        ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                              // Highest title proximity first, smaller distance breaking ties
                              .OrderByDescending(similarResult => similarResult.ProximityByTitleTFIDF)
                              .ThenBy(similarResult => similarResult.Distance)
                                                       .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)

            // NOTE(review): this OrderBy re-sorts the similar entries by Distance, so the proximity ordering applied
            // above survives only among entries with equal Distance (OrderBy is stable) - confirm this is intended.
            foreach (var postWithSimilarResults in postsWithSimilarResults)
                foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
                    Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");

            Console.WriteLine("Done! Press [Enter] to terminate..");