/// <summary>
/// Verifies that <c>CountFeatureExtractor</c> produces the expected count
/// vector for each (sentence, feature) pair after tokenizing the test corpus.
/// </summary>
public void TestVectorizer()
{
    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    // Registers the Treebank tokenizer with the factory; the return value is
    // not needed here — the factory tokenizes with it below.
    tokenizer.GetTokenizer<TreebankTokenizer>();

    var extractor = new CountFeatureExtractor();
    extractor.Sentences = tokenizer.Tokenize(Corpus());
    extractor.Vectorize(new List<string>());

    // Expected per-sentence, per-feature values for the fixed corpus.
    var vectors = Vectors();

    for (int i = 0; i < extractor.Sentences.Count; i++)
    {
        var sentence = extractor.Sentences[i];

        for (int j = 0; j < extractor.Features.Count; j++)
        {
            var word = sentence.Words.Find(w => w.Lemma == extractor.Features[j]);

            // Only features that actually occur in the sentence are checked;
            // absent features have no word to compare against.
            if (word != null)
            {
                // Assert.AreEqual reports expected vs. actual on failure,
                // unlike the opaque Assert.IsTrue(a == b).
                Assert.AreEqual(vectors[i][j], word.Vector);
            }
        }
    }
}
/// <summary>
/// Converts sentences into libsvm-style feature nodes using a
/// <c>CountFeatureExtractor</c>, vectorizing against the TF-IDF feature set.
/// Caches the extractor's features/dictionary in the class fields on first
/// use so subsequent calls map into the same feature space.
/// </summary>
/// <param name="sentences">Sentences to convert; the extractor fills each sentence's Vector.</param>
/// <param name="options">Classifier options; Word2VecFilePath is forwarded to the extractor.</param>
/// <returns>One Node[] per input sentence, indexed by feature position.</returns>
public List<Node[]> GetData(List<Sentence> sentences, ClassifyOptions options)
{
    // NOTE(review): a Word2VecFeatureExtractor was previously considered here;
    // ModelFile is forwarded in case the extractor implementation is swapped.
    var extractor = new CountFeatureExtractor();
    extractor.ModelFile = options.Word2VecFilePath;
    extractor.Sentences = sentences;

    // Seed the extractor from the cached fields when present.
    if (features != null)
    {
        extractor.Features = features;
    }

    if (dictionary != null)
    {
        extractor.Dictionary = dictionary;
    }

    extractor.Vectorize(featuresInTfIdf);

    // First call: remember what the extractor derived.
    if (features == null)
    {
        features = extractor.Features;
    }

    if (dictionary == null)
    {
        dictionary = extractor.Dictionary;
    }

    var datas = new List<Node[]>(sentences.Count);

    foreach (var sentence in sentences)
    {
        // Vectorize has already written the per-feature values into
        // sentence.Vector, so each node is a simple (index, value) pair.
        var curNodes = new List<Node>(extractor.Features.Count);

        for (int i = 0; i < extractor.Features.Count; i++)
        {
            curNodes.Add(new Node(i, sentence.Vector[i]));
        }

        datas.Add(curNodes.ToArray());
    }

    return datas;
}
/// <summary>
/// Builds one Node[] per sentence by looking up each shared feature lemma in
/// the sentence's words; features absent from a sentence contribute a
/// zero-valued node, so every output array has the same length.
/// Caches the extractor's features/dictionary in the class fields on first use.
/// </summary>
/// <param name="sentences">Sentences to convert into feature nodes.</param>
/// <returns>One Node[] per input sentence, indexed by feature position.</returns>
public List<Node[]> GetData(List<Sentence> sentences)
{
    var extractor = new CountFeatureExtractor();
    extractor.Sentences = sentences;

    // Share one feature space across calls: seed the extractor from the
    // cached fields when present, otherwise cache what it derives below.
    if (features != null)
    {
        extractor.Features = features;
    }

    if (dictionary != null)
    {
        extractor.Dictionary = dictionary;
    }

    extractor.Vectorize();

    if (features == null)
    {
        features = extractor.Features;
    }

    if (dictionary == null)
    {
        dictionary = extractor.Dictionary;
    }

    var result = new List<Node[]>();

    foreach (var sentence in sentences)
    {
        var nodes = new List<Node>();

        for (int index = 0; index < extractor.Features.Count; index++)
        {
            var feature = extractor.Features[index];
            var match = sentence.Words.Find(w => w.Lemma == feature);

            // A missing feature maps to an explicit zero entry.
            nodes.Add(match == null ? new Node(index, 0) : new Node(index, match.Vector));
        }

        result.Add(nodes.ToArray());
    }

    return result;
}