public void OneHotTest()
        {
            // Smoke test for the one-hot encoder: read the labeled corpus, tokenize it,
            // carry the labels over to the tokenized sentences and encode everything.
            // There are no assertions here; the test only fails if a step throws.
            var dataReader    = new FasttextDataReader();
            var readerOptions = new ReaderOptions
            {
                DataDir  = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                FileName = "cooking.stackexchange.txt"
            };
            var sentences = dataReader.Read(readerOptions);

            var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);

            // The return value is discarded on purpose.
            // NOTE(review): presumably this call selects the tokenizer inside the factory - confirm.
            tokenizerFactory.GetTokenizer<TreebankTokenizer>();

            var rawTexts  = sentences.Select(sentence => sentence.Text).ToList();
            var tokenized = tokenizerFactory.Tokenize(rawTexts);

            // Tokenized output is matched to its source sentence by position,
            // so each label is copied across by index.
            for (int index = 0; index < tokenized.Count; index++)
            {
                tokenized[index].Label = sentences[index].Label;
            }
            sentences = tokenized.ToList();

            var oneHotEncoder = new OneHotEncoder
            {
                Sentences = sentences
            };
            oneHotEncoder.EncodeAll();
        }
示例#2
0
        /// <summary>
        /// Trains the classifier state held in this instance from labeled sentences.
        /// Steps: extract keywords with <c>TfidfFeatureExtractor</c>, one-hot encode the
        /// sentences against those keywords, build the per-label frequency list, then fill
        /// in each label's prior probability and the conditional-probability dictionary.
        /// </summary>
        /// <param name="sentences">Labeled sentences; their <c>Label</c> and <c>Vector</c> are read after encoding.</param>
        /// <param name="options">Classification options; only <c>Dimension</c> is read here.</param>
        public void Train(List<Sentence> sentences, ClassifyOptions options)
        {
            // Keyword extraction.
            var extractor = new TfidfFeatureExtractor
            {
                Dimension = options.Dimension,
                Sentences = sentences
            };
            extractor.CalBasedOnCategory();

            // One-hot encoding restricted to the extracted keywords.
            var oneHot = new OneHotEncoder
            {
                Sentences = sentences,
                Words     = extractor.Keywords()
            };
            words = oneHot.EncodeAll();

            // Pair every sentence's label with its feature vector.
            var labeledVectors = new List<Tuple<string, double[]>>();
            foreach (var sentence in sentences)
            {
                labeledVectors.Add(new Tuple<string, double[]>(sentence.Label, sentence.Vector));
            }

            // One Probability entry per distinct label, carrying its occurrence count, sorted by label.
            var byLabel     = labeledVectors.GroupBy(pair => pair.Item1);
            var frequencies = byLabel.Select(group => new Probability
            {
                Value = group.Key,
                Freq  = group.Count()
            });
            labelDist = frequencies.OrderBy(entry => entry.Value).ToList();

            nb.LabelDist  = labelDist;
            nb.FeatureSet = labeledVectors;

            // Prior probability of each label.
            foreach (var entry in labelDist)
            {
                entry.Prob = nb.CalPriorProb(entry.Value);
            }

            // Conditional probability for every (label, feature index, feature value) combination.
            // The vector length is taken from the first feature set, so at least one must exist.
            var vectorLength = nb.FeatureSet[0].Item2.Length;

            foreach (var entry in labelDist)
            {
                for (int featureIndex = 0; featureIndex < vectorLength; featureIndex++)
                {
                    for (int valueIndex = 0; valueIndex < features.Length; valueIndex++)
                    {
                        string lookupKey = $"{entry.Value} f{featureIndex} {features[valueIndex]}";
                        condProbDictionary[lookupKey] = nb.CalCondProb(featureIndex, entry.Value, features[valueIndex]);
                    }
                }
            }
        }