/// <summary>
/// Runs the one-hot encoder end to end over the cooking.stackexchange
/// text-classification data set: read, tokenize, re-attach labels, encode.
/// </summary>
/// <remarks>
/// The method makes no assertions; it only verifies that the pipeline
/// executes without throwing.
/// </remarks>
public void OneHotTest()
{
    // Load the labelled sentences from the configured data directory.
    var dataReader = new FasttextDataReader();
    var readerOptions = new ReaderOptions
    {
        DataDir = Path.Combine(
            Configuration.GetValue<String>("MachineLearning:dataDir"),
            "Text Classification",
            "cooking.stackexchange"),
        FileName = "cooking.stackexchange.txt"
    };
    var labelledSentences = dataReader.Read(readerOptions);

    // Tokenize the raw text only.
    // NOTE(review): the return value of GetTokenizer is discarded; presumably the call
    // selects the Treebank tokenizer on the factory itself — confirm.
    var tokenizerFactory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizerFactory.GetTokenizer<TreebankTokenizer>();
    var rawTexts = labelledSentences.Select(sentence => sentence.Text).ToList();
    var tokenizedSentences = tokenizerFactory.Tokenize(rawTexts);

    // Tokenization receives text only, so copy each label back by position.
    for (int index = 0; index < tokenizedSentences.Count; index++)
    {
        tokenizedSentences[index].Label = labelledSentences[index].Label;
    }

    labelledSentences = tokenizedSentences.ToList();

    // Encode every sentence.
    var oneHotEncoder = new OneHotEncoder { Sentences = labelledSentences };
    oneHotEncoder.EncodeAll();
}
/// <summary>
/// Trains the classifier on the given sentences: extracts TF-IDF keywords,
/// one-hot encodes the sentences against those keywords, then computes the
/// per-label prior probabilities and the conditional probability of every
/// (label, feature index, feature value) combination.
/// </summary>
/// <param name="sentences">Labelled sentences used as the training set.</param>
/// <param name="options">Classification options; only <c>Dimension</c> is read here.</param>
/// <remarks>
/// Results are stored in the <c>words</c>, <c>labelDist</c> and
/// <c>condProbDictionary</c> members and on the <c>nb</c> instance.
/// </remarks>
public void Train(List<Sentence> sentences, ClassifyOptions options)
{
    // Keyword selection via TF-IDF.
    var extractor = new TfidfFeatureExtractor();
    extractor.Dimension = options.Dimension;
    extractor.Sentences = sentences;
    extractor.CalBasedOnCategory();

    // One-hot encode the sentences using the extracted keywords as vocabulary.
    var oneHot = new OneHotEncoder();
    oneHot.Sentences = sentences;
    oneHot.Words = extractor.Keywords();
    words = oneHot.EncodeAll();

    // Pair each sentence's label with its encoded vector.
    var labelledVectors = sentences
        .Select(sentence => new Tuple<string, double[]>(sentence.Label, sentence.Vector))
        .ToList();

    // One entry per distinct label with its occurrence count, ordered by label value.
    labelDist = (from pair in labelledVectors
                 group pair by pair.Item1 into sameLabel
                 select new Probability { Value = sameLabel.Key, Freq = sameLabel.Count() } into entry
                 orderby entry.Value
                 select entry).ToList();

    nb.LabelDist = labelDist;
    nb.FeatureSet = labelledVectors;

    // calculate prior prob
    foreach (var entry in labelDist)
    {
        entry.Prob = nb.CalPriorProb(entry.Value);
    }

    // calculate posterior prob
    // loop features
    var vectorLength = nb.FeatureSet[0].Item2.Length;
    foreach (var entry in labelDist)
    {
        for (int featureIndex = 0; featureIndex < vectorLength; featureIndex++)
        {
            for (int valueIndex = 0; valueIndex < features.Length; valueIndex++)
            {
                // Lookup key layout: "<label> f<feature index> <feature value>".
                string lookupKey = $"{entry.Value} f{featureIndex} {features[valueIndex]}";
                condProbDictionary[lookupKey] = nb.CalCondProb(featureIndex, entry.Value, features[valueIndex]);
            }
        }
    }
}