예제 #1
0
        /// <summary>
        /// Tokenizes the sample corpus, runs <c>CountFeatureExtractor.Vectorize</c> on it and
        /// compares the per-word vector values against the expected table from <c>Vectors()</c>.
        /// </summary>
        /// <remarks>
        /// Only features that actually occur in a sentence (matched by lemma) are checked;
        /// a feature with no matching word in the sentence is skipped without asserting.
        /// </remarks>
        public void TestVectorizer()
        {
            // English tokenizer factory with default options; the Treebank tokenizer is selected on it.
            var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            factory.GetTokenizer<TreebankTokenizer>();

            var extractor = new CountFeatureExtractor
            {
                Sentences = factory.Tokenize(Corpus())
            };
            extractor.Vectorize(new List<string>());

            // Expected values, indexed as [sentence][feature].
            var expected = Vectors();

            for (int sentenceIndex = 0; sentenceIndex < extractor.Sentences.Count; sentenceIndex++)
            {
                var words = extractor.Sentences[sentenceIndex].Words;

                for (int featureIndex = 0; featureIndex < extractor.Features.Count; featureIndex++)
                {
                    var feature = extractor.Features[featureIndex];
                    var match = words.Find(candidate => candidate.Lemma == feature);

                    // The feature does not occur in this sentence: nothing to verify.
                    if (match == null)
                    {
                        continue;
                    }

                    Assert.IsTrue(match.Vector == expected[sentenceIndex][featureIndex]);
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Vectorizes the given sentences with a <c>CountFeatureExtractor</c> and converts each
        /// sentence's vector into an array of <c>Node</c> entries.
        /// </summary>
        /// <param name="sentences">Sentences to vectorize; one output array is produced per sentence, in order.</param>
        /// <param name="options">Classification options; only <c>Word2VecFilePath</c> is read here.</param>
        /// <returns>
        /// A list with one <c>Node[]</c> per sentence. Each array has one node per extractor feature,
        /// where node <c>i</c> is built from index <c>i</c> and <c>sentence.Vector[i]</c>.
        /// </returns>
        /// <remarks>
        /// The <c>features</c> and <c>dictionary</c> fields act as a cache: when already set they are
        /// handed to the extractor before vectorizing, and when still null they are filled from the
        /// extractor afterwards so later calls reuse them.
        /// </remarks>
        public List <Node[]> GetData(List <Sentence> sentences, ClassifyOptions options)
        {
            var extractor = new CountFeatureExtractor();

            // NOTE(review): ModelFile is populated from the Word2Vec path although a count-based
            // extractor is used; whether CountFeatureExtractor reads it is not visible here.
            extractor.ModelFile = options.Word2VecFilePath;
            extractor.Sentences = sentences;

            // Reuse previously captured features/dictionary, if any.
            if (features != null)
            {
                extractor.Features = features;
            }

            if (dictionary != null)
            {
                extractor.Dictionary = dictionary;
            }

            extractor.Vectorize(featuresInTfIdf);

            // First run: remember what the extractor produced.
            if (features == null)
            {
                features = extractor.Features;
            }

            if (dictionary == null)
            {
                dictionary = extractor.Dictionary;
            }

            int featureCount = extractor.Features.Count;
            var datas = new List<Node[]>(sentences.Count);

            foreach (var sentence in sentences)
            {
                // Fill the array directly instead of building a temporary list and copying it.
                var nodes = new Node[featureCount];

                for (int i = 0; i < featureCount; i++)
                {
                    // presumably sentence.Vector was populated by Vectorize above — confirm against the extractor.
                    nodes[i] = new Node(i, sentence.Vector[i]);
                }

                datas.Add(nodes);
            }

            return datas;
        }
예제 #3
0
        /// <summary>
        /// Vectorizes the given sentences with a <c>CountFeatureExtractor</c> and builds, for every
        /// sentence, one <c>Node</c> per extractor feature.
        /// </summary>
        /// <param name="sentences">Sentences to vectorize; one output array is produced per sentence, in order.</param>
        /// <returns>
        /// A list with one <c>Node[]</c> per sentence. Node <c>i</c> carries the <c>Vector</c> of the first
        /// word whose <c>Lemma</c> equals feature <c>i</c>, or <c>0</c> when the sentence has no such word.
        /// </returns>
        /// <remarks>
        /// The <c>features</c> and <c>dictionary</c> fields are passed to the extractor when already set,
        /// and are filled from the extractor after vectorizing when they were still null.
        /// </remarks>
        public List <Node[]> GetData(List <Sentence> sentences)
        {
            var extractor = new CountFeatureExtractor
            {
                Sentences = sentences
            };

            // Seed the extractor with cached state from an earlier call, if present.
            if (features != null)
            {
                extractor.Features = features;
            }

            if (dictionary != null)
            {
                extractor.Dictionary = dictionary;
            }

            extractor.Vectorize();

            // Capture the extractor's state the first time through.
            if (features == null)
            {
                features = extractor.Features;
            }

            if (dictionary == null)
            {
                dictionary = extractor.Dictionary;
            }

            var result = new List<Node[]>();

            foreach (var current in sentences)
            {
                var row = new Node[extractor.Features.Count];

                for (int index = 0; index < row.Length; index++)
                {
                    var feature = extractor.Features[index];
                    var hit = current.Words.Find(word => word.Lemma == feature);

                    if (hit != null)
                    {
                        row[index] = new Node(index, hit.Vector);
                    }
                    else
                    {
                        // Feature absent from this sentence: emit a zero-valued node.
                        row[index] = new Node(index, 0);
                    }
                }

                result.Add(row);
            }

            return result;
        }