/// <summary>
        /// Converts a raw data set into a <see cref="CntkTrainDataSet"/>.
        /// Each row's label is registered as a topic, its sentence is split into words,
        /// and every word is assigned a vector from a space built over all collected words.
        /// A progress line is written to the console every 10 rows.
        /// </summary>
        /// <param name="dataSet">Rows exposing a <c>Label</c> and a <c>Sentence</c>.</param>
        /// <returns>The training data set built from the topic indexer, the word vector space and the converted sentences.</returns>
        public CntkTrainDataSet Transform(DataSet dataSet)
        {
            var tokenizer  = new MeCabTextSegmentator();
            var topics     = new CntkTopicStore();
            var vocabulary = new HashSet <string>();
            int nextId     = 0;
            int processed  = 0;

            List <CntkSentence> converted = dataSet.Select(row =>
            {
                // Report progress on every 10th row.
                processed++;
                if (processed % 10 == 0)
                {
                    Console.WriteLine(processed + " sentences");
                }

                CntkTopic rowTopic = topics.GetOrRegister(row.Label);

                // Collect the words while applying cleansing.
                List <string> tokens = tokenizer.Split(row.Sentence);
                foreach (string token in tokens)
                {
                    vocabulary.Add(token);
                }

                // Vectors are left unset here; they are filled in once the vocabulary is complete.
                List <CntkWord> wordItems = tokens
                    .Select(token => new CntkWord() { Text = token })
                    .ToList();

                var item = new CntkSentence()
                {
                    Id = nextId++,
                    Topic = rowTopic,
                    Sentence = row.Sentence,
                    Words = wordItems,
                };
                return item;
            }).ToList();

            // The vector representation cannot be fixed until the full set of words
            // is known, so vector assignment is deferred to this final pass.
            var vectorSpace = OneHotWordVectorSpace.Build(vocabulary);

            foreach (var wordItem in converted.SelectMany(s => s.Words))
            {
                wordItem.Value = vectorSpace.ToVector(wordItem.Text);
            }

            return new CntkTrainDataSet(topics.Indexer, vectorSpace, converted);
        }
Example #2
0
        /// <summary>
        /// Converts a sentence into the word sequence that is fed to the classifier.
        /// </summary>
        /// <param name="query">The sentence to split into words.</param>
        /// <returns>The words produced by the segmentator for <paramref name="query"/>.</returns>
        private List <string> ToWordSequence(string query)
        {
            // NOTE(review): the meaning of the second argument (true) is not visible here —
            // confirm against the segmentator's Split signature.
            return segmantator.Split(query, true);
        }