Example #1
0
        /// <summary>
        /// Classifies a sentence and returns every known label paired with its
        /// normalized posterior probability (probabilities sum to 1).
        /// </summary>
        /// <param name="sentence">Sentence to classify; its Vector is filled in by the one-hot encoder.</param>
        /// <param name="options">Classification options (not used by this method).</param>
        /// <returns>List of (label, probability) tuples, one per trained label.</returns>
        public List <Tuple <string, double> > Classify(Sentence sentence, ClassifyOptions options)
        {
            var encoder = new OneHotEncoder();

            // Encode the sentence using the vocabulary captured during training.
            encoder.Words = words;
            encoder.Encode(sentence);

            var results = new List <Tuple <string, double> >();

            // Calculate the (unnormalized) posterior probability for each label.
            labelDist.ForEach(lf =>
            {
                var prob = nb.CalPosteriorProb(lf.Value, sentence.Vector, lf.Prob, condProbDictionary);
                results.Add(new Tuple <string, double>(lf.Value, prob));
            });

            // Normalize so the returned probabilities sum to 1.
            double total = results.Select(x => x.Item2).Sum();

            return(results.Select(x => new Tuple <string, double>(x.Item1, x.Item2 / total)).ToList());
        }
Example #2
0
        /// <summary>
        /// Smoke test: reads the cooking.stackexchange data set, tokenizes it,
        /// and runs the full corpus through the one-hot encoder.
        /// </summary>
        public void OneHotTest()
        {
            // Load the fasttext-formatted cooking.stackexchange data set.
            var dataReader   = new FasttextDataReader();
            var rawSentences = dataReader.Read(new ReaderOptions
            {
                DataDir  = Path.Combine(Configuration.GetValue <String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                FileName = "cooking.stackexchange.txt"
            });

            // Tokenize each sentence text with the Treebank tokenizer.
            var factory = new TokenizerFactory(new TokenizationOptions {
            }, SupportedLanguage.English);

            factory.GetTokenizer <TreebankTokenizer>();

            var tokenized = factory.Tokenize(rawSentences.Select(s => s.Text).ToList());

            // Carry the original labels over onto the tokenized sentences.
            for (int idx = 0; idx < tokenized.Count; idx++)
            {
                tokenized[idx].Label = rawSentences[idx].Label;
            }

            // One-hot encode the whole corpus.
            var oneHot = new OneHotEncoder();

            oneHot.Sentences = tokenized.ToList();
            oneHot.EncodeAll();
        }
Example #3
0
        /// <summary>
        /// Trains the naive-Bayes model: extracts TF-IDF keywords, one-hot encodes
        /// the sentences against that vocabulary, builds the label distribution,
        /// and precomputes prior and conditional probabilities.
        /// </summary>
        /// <param name="sentences">Labeled training sentences.</param>
        /// <param name="options">Training options; Dimension bounds the keyword count.</param>
        public void Train(List <Sentence> sentences, ClassifyOptions options)
        {
            // Pick the top keywords per category via TF-IDF.
            var tfidf = new TfidfFeatureExtractor();

            tfidf.Dimension = options.Dimension;
            tfidf.Sentences = sentences;
            tfidf.CalBasedOnCategory();

            // One-hot encode every sentence against the keyword vocabulary.
            var encoder = new OneHotEncoder();

            encoder.Sentences = sentences;
            encoder.Words     = tfidf.Keywords();
            words             = encoder.EncodeAll();

            var featureSets = sentences.Select(s => new Tuple <string, double[]>(s.Label, s.Vector)).ToList();

            // Frequency of each label, ordered by label name.
            labelDist = featureSets
                        .GroupBy(fs => fs.Item1)
                        .Select(g => new Probability
            {
                Value = g.Key,
                Freq  = g.Count()
            })
                        .OrderBy(p => p.Value)
                        .ToList();

            nb.LabelDist  = labelDist;
            nb.FeatureSet = featureSets;

            // Prior probability of each label.
            foreach (var label in labelDist)
            {
                label.Prob = nb.CalPriorProb(label.Value);
            }

            // Conditional probability for every (label, feature index, feature value)
            // combination, keyed as "<label> f<index> <value>".
            var featureCount = nb.FeatureSet[0].Item2.Length;

            foreach (var label in labelDist)
            {
                for (int f = 0; f < featureCount; f++)
                {
                    for (int v = 0; v < features.Length; v++)
                    {
                        string key = $"{label.Value} f{f} {features[v]}";
                        condProbDictionary[key] = nb.CalCondProb(f, label.Value, features[v]);
                    }
                }
            }
        }