예제 #1
0
        /// <summary>
        /// Scores a sentence against every known label using the trained
        /// naive Bayes state and returns normalized (label, probability) pairs.
        /// </summary>
        /// <param name="sentence">Sentence to classify; its Vector is (re)encoded here.</param>
        /// <param name="options">Classification options (not used directly in this method).</param>
        /// <returns>One (label, normalized probability) tuple per known label.</returns>
        public List <Tuple <string, double> > Classify(Sentence sentence, ClassifyOptions options)
        {
            // Encode the sentence with the vocabulary captured at training time.
            var encoder = new OneHotEncoder
            {
                Words = words
            };
            encoder.Encode(sentence);

            // Posterior probability for each label.
            var results = new List <Tuple <string, double> >();
            foreach (var label in labelDist)
            {
                double posterior = nb.CalPosteriorProb(label.Value, sentence.Vector, label.Prob, condProbDictionary);
                results.Add(new Tuple <string, double>(label.Value, posterior));
            }

            // Normalize so the returned scores sum to 1.
            double total = results.Sum(x => x.Item2);

            return results
                   .Select(x => new Tuple <string, double>(x.Item1, x.Item2 / total))
                   .ToList();
        }
예제 #2
0
 // Wires up a classifier factory for the given language and options.
 // NOTE(review): IClassify and IFeatureExtractor are instantiated with `new`,
 // so despite the `I` prefix they must be concrete classes here — consider
 // renaming them or injecting real implementations. TODO confirm intent.
 public ClassifierFactory(ClassifyOptions options, SupportedLanguage lang)
 {
     _lang            = lang;
     _options         = options;
     _classifier      = new IClassify();
     featureExtractor = new IFeatureExtractor();
 }
예제 #3
0
        /// <summary>
        /// Vectorizes sentences with a count-based feature extractor and converts
        /// each sentence's vector into an SVM <c>Node[]</c> row.
        /// Lazily captures the extractor's features/dictionary on first call so
        /// later calls (e.g. at prediction time) reuse the training vocabulary.
        /// </summary>
        /// <param name="sentences">Sentences to vectorize.</param>
        /// <param name="options">Options carrying the word2vec model path (unused by the count extractor).</param>
        /// <returns>One Node[] per sentence, indexed by feature position.</returns>
        public List <Node[]> GetData(List <Sentence> sentences, ClassifyOptions options)
        {
            var extractor = new CountFeatureExtractor();

            // NOTE(review): ModelFile is only meaningful for a Word2Vec extractor;
            // kept for compatibility with the previous implementation.
            extractor.ModelFile = options.Word2VecFilePath;
            extractor.Sentences = sentences;

            // Reuse the training-time vocabulary when it already exists.
            if (features != null)
            {
                extractor.Features = features;
            }

            if (dictionary != null)
            {
                extractor.Dictionary = dictionary;
            }

            extractor.Vectorize(featuresInTfIdf);

            // First call: capture the vocabulary the extractor just built.
            if (features == null)
            {
                features = extractor.Features;
            }

            if (dictionary == null)
            {
                dictionary = extractor.Dictionary;
            }

            List <Node[]> datas = new List <Node[]>();

            foreach (var sentence in sentences)
            {
                List <Node> curNodes = new List <Node>();

                // One node per feature index; the sentence vector is assumed to be
                // aligned with extractor.Features (produced by Vectorize above).
                for (int i = 0; i < extractor.Features.Count; i++)
                {
                    curNodes.Add(new Node(i, sentence.Vector[i]));
                }

                datas.Add(curNodes.ToArray());
            }
            return(datas);
        }
예제 #4
0
        /// <summary>
        /// Classifies a sentence by running the SVM prediction and pairing each
        /// probability with its category name.
        /// </summary>
        /// <param name="sentence">Sentence to classify.</param>
        /// <param name="options">Options carrying the loaded model/transform.</param>
        /// <returns>(category, probability) pairs; empty if prediction produced no rows.</returns>
        public List <Tuple <string, double> > Classify(Sentence sentence, ClassifyOptions options)
        {
            var categoryList = new List <Tuple <string, double> >();

            // Predict returns one probability row per input sentence; we sent one.
            var result = Predict(sentence, options).FirstOrDefault();

            // BUG FIX: previously dereferenced result without a null check,
            // throwing NullReferenceException when no prediction row came back.
            if (result == null)
            {
                return(categoryList);
            }

            for (int i = 0; i < result.Length; i++)
            {
                categoryList.Add(new Tuple <string, double>(categories[i], result[i]));
            }

            return(categoryList);
        }
예제 #5
0
        /// <summary>
        /// Convenience wrapper: loads the persisted model, classifies the
        /// sentence, and returns the results ordered best-first.
        /// </summary>
        /// <param name="sentence">Sentence to classify.</param>
        /// <returns>(label, score) pairs sorted by descending score.</returns>
        public List <Tuple <string, double> > Classify(Sentence sentence)
        {
            // Re-load the persisted model before scoring.
            var classifyOptions = new ClassifyOptions
            {
                ModelFilePath = _options.ModelFilePath
            };
            _classifier.LoadModel(classifyOptions);

            // Score and present the most probable label first.
            return _classifier
                   .Classify(sentence, classifyOptions)
                   .OrderByDescending(x => x.Item2)
                   .ToList();
        }
예제 #6
0
        /// <summary>
        /// Predicts label probabilities for a single feature set using the model
        /// and range transform carried in <paramref name="options"/>.
        /// </summary>
        /// <param name="featureSet">The labeled feature set to score.</param>
        /// <param name="options">Carries the trained Model and RangeTransform.</param>
        /// <returns>Probability rows as returned by the SVM predictor.</returns>
        public double[][] Predict(FeaturesWithLabel featureSet, ClassifyOptions options)
        {
            // Build a single-row problem from the one feature set.
            var batch = new List <FeaturesWithLabel> { featureSet };

            var predict = new Problem
            {
                X        = GetData(batch).ToArray(),
                Y        = new double[1],
                // NOTE(review): hard-coded feature dimension — confirm it
                // matches the dimension used at training time.
                MaxIndex = 300
            };
            predict.Count = predict.X.Count();

            // Apply the same scaling that was computed during training.
            Problem scaled = options.Transform.Scale(predict);

            return Prediction.PredictLabelsProbability(options.Model, scaled);
        }
예제 #7
0
        /// <summary>
        /// Predicts label probabilities for a single sentence using the trained
        /// model and the range transform carried in <paramref name="options"/>.
        /// </summary>
        /// <param name="sentence">Sentence to score.</param>
        /// <param name="options">Carries the RangeTransform computed at training time.</param>
        /// <returns>Probability rows as returned by the SVM predictor.</returns>
        public double[][] Predict(Sentence sentence, ClassifyOptions options)
        {
            // Wrap the single sentence so it can go through the batch vectorizer.
            var batch = new List <Sentence> { sentence };

            var predict = new Problem
            {
                X        = GetData(batch).ToArray(),
                Y        = new double[1],
                MaxIndex = features.Count
            };
            predict.Count = predict.X.Count();

            // Reuse the scaling transform computed during training.
            transform = options.Transform;
            Problem scaled = transform.Scale(predict);

            return Prediction.PredictLabelsProbability(model, scaled);
        }
예제 #8
0
        /// <summary>
        /// Trains the multinomial naive Bayes classifier: selects TF-IDF keyword
        /// features, one-hot encodes the sentences, computes per-label priors,
        /// and precomputes all conditional probabilities into a lookup dictionary.
        /// </summary>
        /// <param name="sentences">Labeled training sentences.</param>
        /// <param name="options">Carries the TF-IDF dimension.</param>
        public void Train(List <Sentence> sentences, ClassifyOptions options)
        {
            // Select keyword features by TF-IDF, computed per category.
            var tfidf = new TfIdfFeatureExtractor();

            tfidf.Dimension = options.Dimension;
            tfidf.Sentences = sentences;
            tfidf.CalBasedOnCategory();

            // One-hot encode every sentence against the TF-IDF keyword vocabulary;
            // the encoded vocabulary is kept in `words` for use at classify time.
            var encoder = new OneHotEncoder();

            encoder.Sentences = sentences;
            encoder.Words     = tfidf.Keywords();
            words             = encoder.EncodeAll();

            // (label, feature-vector) pairs for the naive Bayes engine.
            var featureSets = sentences.Select(x => new Tuple <string, double[]>(x.Label, x.Vector)).ToList();

            // Label frequency distribution, ordered by label for determinism.
            labelDist = featureSets.GroupBy(x => x.Item1)
                        .Select(x => new Probability
            {
                Value = x.Key,
                Freq  = x.Count()
            })
                        .OrderBy(x => x.Value)
                        .ToList();

            nb.LabelDist  = labelDist;
            nb.FeatureSet = featureSets;

            // calculate prior prob
            labelDist.ForEach(l => l.Prob = nb.CalPriorProb(l.Value));

            // calculate posterior prob
            // loop features
            var featureCount = nb.FeatureSet[0].Item2.Length;

            // Precompute P(feature x = value v | label) for every combination and
            // cache it under the key "<label> f<x> <value>".
            // NOTE(review): `features` here appears to enumerate possible feature
            // VALUES (not feature names) — TODO confirm against its declaration.
            labelDist.ForEach(label =>
            {
                for (int x = 0; x < featureCount; x++)
                {
                    for (int v = 0; v < features.Length; v++)
                    {
                        string key = $"{label.Value} f{x} {features[v]}";
                        condProbDictionary[key] = nb.CalCondProb(x, label.Value, features[v]);
                    }
                }
            });
        }
예제 #9
0
        /// <summary>
        /// Trains an SVM classifier on the given sentences: extracts TF-IDF
        /// keyword features, builds and scales the training problem, then fits
        /// the model. The computed RangeTransform is kept for prediction.
        /// </summary>
        /// <param name="sentences">Labeled training sentences.</param>
        /// <param name="options">Carries the TF-IDF dimension.</param>
        /// <param name="svm">SVM formulation; defaults to C_SVC.</param>
        /// <param name="kernel">Kernel type; defaults to RBF.</param>
        /// <param name="probability">Whether to train probability estimates.</param>
        /// <param name="outputFile">Unused; kept for interface compatibility.</param>
        /// <exception cref="ArgumentException">Thrown when the training data has only one class.</exception>
        public void SVMClassifierTrain(List <Sentence> sentences, ClassifyOptions options, SvmType svm = SvmType.C_SVC, KernelType kernel = KernelType.RBF, bool probability = true, string outputFile = null)
        {
            // Select keyword features by TF-IDF, computed per category.
            var tfidf = new TfIdfFeatureExtractor();

            tfidf.Dimension = options.Dimension;
            tfidf.Sentences = sentences;
            tfidf.CalBasedOnCategory();
            featuresInTfIdf = tfidf.Keywords();

            // Build the training problem from vectorized sentences.
            Problem train = new Problem();

            train.X        = GetData(sentences, options).ToArray();
            train.Y        = GetLabels(sentences).ToArray();
            train.Count    = train.X.Count();
            train.MaxIndex = train.X[0].Count();

            Parameter param = new Parameter();

            // Scale features into a fixed range; keep the transform for prediction.
            transform = RangeTransform.Compute(train);
            Problem scaled = transform.Scale(train);

            param.Gamma       = 1.0 / 3;
            param.SvmType     = svm;
            param.KernelType  = kernel;
            param.Probability = probability;

            int numberOfClasses = train.Y.Distinct().Count();

            // BUG FIX: previously this only wrote a warning to the console and
            // kept training on a single class; throw instead, consistent with
            // the other SVMClassifierTrain overloads.
            if (numberOfClasses == 1)
            {
                throw new ArgumentException("Number of classes must greater than one!");
            }

            // C-SVC uses per-class weights; give every class equal weight.
            if (svm == SvmType.C_SVC)
            {
                for (int i = 0; i < numberOfClasses; i++)
                {
                    param.Weights[i] = 1;
                }
            }

            model = Training.Train(scaled, param);

            Console.Write("Training finished!");
        }
예제 #10
0
        /// <summary>
        /// Persists all SVM classifier artifacts (features, dictionary,
        /// categories, range transform, and the model itself) under ModelDir.
        /// </summary>
        /// <param name="options">Carries ModelDir and ModelFilePath; artifact paths are filled in here.</param>
        /// <returns>The model file path.</returns>
        public string SaveModel(ClassifyOptions options)
        {
            // All artifacts live alongside the model inside ModelDir.
            string dir = options.ModelDir;
            options.TransformFilePath  = Path.Combine(dir, "transform");
            options.FeaturesFileName   = Path.Combine(dir, "features");
            options.DictionaryFileName = Path.Combine(dir, "dictionary");
            options.CategoriesFileName = Path.Combine(dir, "categories");

            // Training metadata is stored as JSON.
            File.WriteAllText(options.FeaturesFileName, JsonConvert.SerializeObject(features));
            File.WriteAllText(options.DictionaryFileName, JsonConvert.SerializeObject(dictionary));
            File.WriteAllText(options.CategoriesFileName, JsonConvert.SerializeObject(categories));

            // The range transform and SVM model use their own binary writers.
            RangeTransform.Write(options.TransformFilePath, transform);
            Bigtree.Algorithm.SVM.Model.Write(options.ModelFilePath, model);

            return options.ModelFilePath;
        }
예제 #11
0
        /// <summary>
        /// Persists the trained naive Bayes state (label distribution,
        /// conditional probabilities, and encoded vocabulary) as UTF-8 JSON.
        /// </summary>
        /// <param name="options">Carries the target ModelFilePath.</param>
        /// <returns>The model file path.</returns>
        public string SaveModel(ClassifyOptions options)
        {
            // Snapshot the trained state into a serializable model object.
            var snapshot = new MultinomiaNaiveBayesModel
            {
                LabelDist          = labelDist,
                CondProbDictionary = condProbDictionary,
                Values             = words
            };

            // Serialize once, then write the UTF-8 bytes to the model file.
            string json = JsonConvert.SerializeObject(snapshot);
            using (var stream = new FileStream(options.ModelFilePath, FileMode.Create))
            using (var bw = new BinaryWriter(stream))
            {
                bw.Write(Encoding.UTF8.GetBytes(json));
            }

            return options.ModelFilePath;
        }
예제 #12
0
        /// <summary>
        /// Restores all SVM classifier artifacts previously written by
        /// SaveModel: features, dictionary, categories, the model, and the
        /// range transform (stored back onto <paramref name="options"/>).
        /// </summary>
        /// <param name="options">Carries ModelDir and ModelName; artifact paths are filled in here.</param>
        /// <returns>The loaded SVM model.</returns>
        object IClassifier.LoadModel(ClassifyOptions options)
        {
            // Resolve every artifact path relative to the model directory.
            string dir = options.ModelDir;
            options.FeaturesFileName   = Path.Combine(dir, "features");
            options.DictionaryFileName = Path.Combine(dir, "dictionary");
            options.ModelFilePath      = Path.Combine(dir, options.ModelName);
            options.TransformFilePath  = Path.Combine(dir, "transform");
            options.CategoriesFileName = Path.Combine(dir, "categories");

            // Restore JSON-serialized training metadata.
            features   = JsonConvert.DeserializeObject <List <String> >(File.ReadAllText(options.FeaturesFileName));
            dictionary = JsonConvert.DeserializeObject <List <Tuple <string, int> > >(File.ReadAllText(options.DictionaryFileName));
            categories = JsonConvert.DeserializeObject <List <String> >(File.ReadAllText(options.CategoriesFileName));

            // Restore the SVM model and the transform used at prediction time.
            model = Bigtree.Algorithm.SVM.Model.Read(options.ModelFilePath);
            options.Transform = RangeTransform.Read(options.TransformFilePath);

            return model;
        }
예제 #13
0
        /// <summary>
        /// Restores the naive Bayes state previously written by SaveModel:
        /// reads the UTF-8 JSON model file and repopulates the label
        /// distribution, conditional probabilities, and vocabulary.
        /// </summary>
        /// <param name="options">Carries the ModelFilePath to read.</param>
        /// <returns>The deserialized model object.</returns>
        public Object LoadModel(ClassifyOptions options)
        {
            string json;

            // Read the raw UTF-8 bytes written by SaveModel.
            using (var stream = new FileStream(options.ModelFilePath, FileMode.Open))
            using (var br = new BinaryReader(stream))
            {
                byte[] payload = br.ReadBytes((int)br.BaseStream.Length);
                json = Encoding.UTF8.GetString(payload);
            }

            var model = JsonConvert.DeserializeObject <MultinomiaNaiveBayesModel>(json);

            // Restore the trained state onto this instance.
            labelDist          = model.LabelDist;
            condProbDictionary = model.CondProbDictionary;
            words              = model.Values;

            return model;
        }
예제 #14
0
        /// <summary>
        /// Trains an SVM classifier on pre-vectorizable sentences: builds and
        /// scales the training problem, validates the class count, then fits
        /// the model. The computed RangeTransform is kept for prediction.
        /// </summary>
        /// <param name="sentences">Labeled training sentences.</param>
        /// <param name="options">Classification options (not used directly here).</param>
        /// <param name="svm">SVM formulation; defaults to C_SVC.</param>
        /// <param name="kernel">Kernel type; defaults to RBF.</param>
        /// <param name="probability">Whether to train probability estimates.</param>
        /// <param name="outputFile">Unused; kept for interface compatibility.</param>
        /// <exception cref="ArgumentException">Thrown when the training data has only one class.</exception>
        public void SVMClassifierTrain(List <Sentence> sentences, ClassifyOptions options, SvmType svm = SvmType.C_SVC, KernelType kernel = KernelType.RBF, bool probability = true, string outputFile = null)
        {
            // Assemble the training problem from vectorized sentences.
            var train = new Problem
            {
                X = GetData(sentences).ToArray(),
                Y = GetLabels(sentences).ToArray()
            };
            train.Count    = train.X.Count();
            train.MaxIndex = train.X[0].Count();

            // Scale features into a fixed range; keep the transform for prediction.
            transform = RangeTransform.Compute(train);
            Problem scaled = transform.Scale(train);

            var param = new Parameter();
            param.Gamma       = 1.0 / 3;
            param.SvmType     = svm;
            param.KernelType  = kernel;
            param.Probability = probability;

            int numberOfClasses = train.Y.OrderBy(x => x).Distinct().Count();
            if (numberOfClasses == 1)
            {
                throw new ArgumentException("Number of classes can't be one!");
            }

            // C-SVC uses per-class weights; give every class equal weight.
            if (svm == SvmType.C_SVC)
            {
                for (int i = 0; i < numberOfClasses; i++)
                {
                    param.Weights[i] = 1;
                }
            }

            model = Training.Train(scaled, param);

            Console.Write("Training finished!");
        }
예제 #15
0
        /// <summary>
        /// Trains an SVM classifier from pre-built labeled feature sets, then
        /// writes both the trained model and its range transform to the paths
        /// given in <paramref name="options"/>.
        /// </summary>
        /// <param name="featureSets">Labeled feature sets to train on.</param>
        /// <param name="options">Carries TransformFilePath and ModelFilePath for persistence.</param>
        /// <param name="svm">SVM formulation; defaults to C_SVC.</param>
        /// <param name="kernel">Kernel type; defaults to RBF.</param>
        /// <param name="probability">Whether to train probability estimates.</param>
        /// <param name="outputFile">Unused; kept for interface compatibility.</param>
        /// <exception cref="ArgumentException">Thrown when the training data has only one class.</exception>
        public void SVMClassifierTrain(List <FeaturesWithLabel> featureSets, ClassifyOptions options, SvmType svm = SvmType.C_SVC, KernelType kernel = KernelType.RBF, bool probability = true, string outputFile = null)
        {
            // Assemble the training problem from the labeled feature sets.
            var train = new Problem
            {
                X = GetData(featureSets).ToArray(),
                Y = GetLabels(featureSets).ToArray()
            };
            train.Count    = train.X.Count();
            // NOTE(review): hard-coded feature dimension — confirm it matches
            // the dimension produced by GetData.
            train.MaxIndex = 300;

            var param = new Parameter();
            RangeTransform transform = RangeTransform.Compute(train);
            Problem scaled = transform.Scale(train);

            param.Gamma       = 1.0 / 3;
            param.SvmType     = svm;
            param.KernelType  = kernel;
            param.Probability = probability;

            int numberOfClasses = train.Y.Distinct().Count();
            if (numberOfClasses == 1)
            {
                throw new ArgumentException("Number of classes can't be one!");
            }

            // C-SVC uses per-class weights; give every class equal weight.
            if (svm == SvmType.C_SVC)
            {
                for (int i = 0; i < numberOfClasses; i++)
                {
                    param.Weights[i] = 1;
                }
            }

            // Train, then persist the model together with its scaling transform.
            var trained = Training.Train(scaled, param);

            RangeTransform.Write(options.TransformFilePath, transform);
            SVM.BotSharp.MachineLearning.Model.Write(options.ModelFilePath, trained);
            Console.Write("Training finished!");
        }
예제 #16
0
 // Trains by delegating to SVMClassifierTrain with its default parameters.
 public void Train(List <Sentence> sentences, ClassifyOptions options)
 {
     SVMClassifierTrain(sentences, options);
 }
예제 #17
0
 // Classification is not supported by this implementation yet.
 public List <Tuple <string, double> > Classify(Sentence sentence, ClassifyOptions options)
 {
     throw new NotImplementedException();
 }
예제 #18
0
 // Intentionally a no-op: training is disabled in this implementation.
 // NOTE(review): the commented-out call references `featureSets`, which is
 // not a parameter here — confirm what this stub was meant to do.
 public void Train(List <Sentence> sentences, ClassifyOptions options)
 {
     // SVMClassifierTrain(featureSets, options);
 }
예제 #19
0
 // Model loading is not supported by this implementation yet.
 object IClassifier.LoadModel(ClassifyOptions options)
 {
     throw new NotImplementedException();
 }
예제 #20
0
 // Model persistence is not supported by this implementation yet.
 public string SaveModel(ClassifyOptions options)
 {
     throw new NotImplementedException();
 }