/// <summary>
/// Converts a labelled data set into a CNTK training data set.
/// </summary>
/// <remarks>
/// Each row's sentence is split into words, its label is registered with a topic
/// store, and every distinct word is collected into a corpus. Once all rows have
/// been processed, a one-hot word vector space is built from that corpus and each
/// word of each sentence is assigned its vector from the space.
/// </remarks>
/// <param name="dataSet">Rows to convert; each row supplies a <c>Label</c> and a <c>Sentence</c>.</param>
/// <returns>
/// A training data set built from the topic store's indexer, the word vector space
/// and the converted sentences.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="dataSet"/> is <c>null</c>.</exception>
public CntkTrainDataSet Transform(DataSet dataSet)
{
    // Fail fast, before any segmentator/topic-store state is created.
    if (dataSet == null)
    {
        throw new ArgumentNullException(nameof(dataSet));
    }

    var segmentator = new MeCabTextSegmentator();
    var topicStore = new CntkTopicStore();
    var corpus = new HashSet<string>();

    // Sentence ids are handed out sequentially, starting at 0, in enumeration order.
    int nextSentenceId = 0;

    List<CntkSentence> sentences = dataSet.Select(row =>
    {
        int id = nextSentenceId++;

        // Progress report after every 10th row.
        int processed = id + 1;
        if (processed % 10 == 0)
        {
            Console.WriteLine(processed + " sentences");
        }

        CntkTopic topic = topicStore.GetOrRegister(row.Label);

        // Collect the words while cleansing the sentence.
        List<string> words = segmentator.Split(row.Sentence);
        corpus.UnionWith(words);

        return new CntkSentence()
        {
            Id = id,
            Topic = topic,
            Sentence = row.Sentence,
            Words = words.Select(w => new CntkWord() { Text = w }).ToList(),
        };
    }).ToList();

    // The vector representation cannot be decided until the full vocabulary is
    // known, so vectors are assigned in a second pass at the very end.
    var space = OneHotWordVectorSpace.Build(corpus);
    foreach (var sentence in sentences)
    {
        foreach (var word in sentence.Words)
        {
            word.Value = space.ToVector(word.Text);
        }
    }

    return new CntkTrainDataSet(topicStore.Indexer, space, sentences);
}
/// <summary>
/// Converts a sentence into the word sequence that is fed to the classifier.
/// </summary>
/// <param name="query">The sentence to split into words.</param>
/// <returns>The words produced by the segmentator for <paramref name="query"/>.</returns>
private List<string> ToWordSequence(string query)
{
    // NOTE(review): the meaning of the second argument (true) is defined by the
    // segmentator's Split overload, which is not visible here - confirm.
    return segmantator.Split(query, true);
}