/// <summary>
        /// Writes a JSON summary of the training data set to <c>Data\export_info.json</c>.
        /// The summary object is produced by <c>CreateExportInfo</c> and serialised with
        /// <c>JsonHelper.ToJson</c>.
        /// </summary>
        /// <param name="trainData">Training data set to summarise.</param>
        /// <exception cref="ArgumentNullException"><paramref name="trainData"/> is null.</exception>
        public void Export(CntkTrainDataSet trainData)
        {
            if (trainData == null)
            {
                throw new ArgumentNullException("trainData");
            }

            Console.WriteLine("creating export_info.json in folder : Data ");
            var info = this.CreateExportInfo(trainData);

            string filePath = @"Data\export_info.json";

            // File.WriteAllText does not create missing folders, so make sure the
            // directory part of the path (if the platform sees one) exists first.
            string directory = Path.GetDirectoryName(filePath);
            if (!string.IsNullOrEmpty(directory))
            {
                Directory.CreateDirectory(directory);
            }

            File.WriteAllText(filePath, JsonHelper.ToJson(info));
        }
        /// <summary>
        /// Builds the export summary for a training data set.
        /// </summary>
        /// <param name="trainData">Source of the label index count and the word space dimension size.</param>
        /// <returns>
        /// An <see cref="ExportInfo"/> whose LabelDimension is the number of label indexes and
        /// whose WordDimension is the dimension size of the word space.
        /// </returns>
        private ExportInfo CreateExportInfo(CntkTrainDataSet trainData)
        {
            var labelDimension = trainData.LabelIndexes.Count;
            var wordDimension  = trainData.WordSpace.DimensionSize;

            return new ExportInfo
            {
                LabelDimension = labelDimension,
                WordDimension  = wordDimension,
            };
        }
        /// <summary>
        /// Converts an imported data set into a CNTK training data set. Each row's label is
        /// registered as a topic and its sentence is split into words; once every row has been
        /// processed, a word space is built from all collected words and each word receives
        /// its vector from that space.
        /// </summary>
        /// <param name="dataSet">Rows exposing a Label and a Sentence.</param>
        /// <returns>
        /// A training data set made of the topic indexer, the word space and the sentences.
        /// </returns>
        public CntkTrainDataSet Transform(DataSet dataSet)
        {
            var textSegmentator = new MeCabTextSegmentator();
            var topics          = new CntkTopicStore();
            var vocabulary      = new HashSet<string>();

            // One counter serves two purposes: it is the id given to the next sentence and,
            // plus one, the 1-based number of the row currently being processed.
            int nextSentenceId = 0;

            List<CntkSentence> sentences = dataSet.Select(row =>
            {
                // Progress report every tenth row.
                int rowNumber = nextSentenceId + 1;
                if (rowNumber % 10 == 0)
                {
                    Console.WriteLine(rowNumber + " sentences");
                }

                CntkTopic topic = topics.GetOrRegister(row.Label);

                // Collect the words (per the original author's note, Split also cleanses the text).
                List<string> tokens = textSegmentator.Split(row.Sentence);
                foreach (string token in tokens)
                {
                    vocabulary.Add(token);
                }

                var sentence = new CntkSentence
                {
                    Id       = nextSentenceId,
                    Topic    = topic,
                    Sentence = row.Sentence,
                    Words    = tokens.ConvertAll(token => new CntkWord { Text = token }),
                };

                nextSentenceId++;
                return sentence;
            }).ToList();

            // The vector representation cannot be decided until the full set of words is
            // known, so vectors are assigned in a final pass.
            var wordSpace = OneHotWordVectorSpace.Build(vocabulary);

            foreach (var word in sentences.SelectMany(s => s.Words))
            {
                word.Value = wordSpace.ToVector(word.Text);
            }

            return new CntkTrainDataSet(topics.Indexer, wordSpace, sentences);
        }
        /// <summary>
        /// Runs the whole pipeline: imports the training data set from a TSV file, transforms
        /// it into CNTK training data, then exports the training data and its summary.
        /// </summary>
        public void Run()
        {
            // Import the training data. The path is hard-coded (a context-based lookup,
            // context.BaseContext.TrainDataSetPath(), was left commented out).
            const string trainDataSetFile = @"Resource\train_dataset.tsv";
            DataSet imported = this.dataSetImport.Import(trainDataSetFile);

            // Generate the CNTK training data.
            Console.WriteLine("reading data...");
            CntkTrainDataSet cntkData = this.trainDataSetTransform.Transform(imported);

            // Write the CNTK training data.
            this.trainDataSetExport.Export(cntkData);

            // Write the training data summary.
            this.summaryInfoExport.Export(cntkData);
        }
Exemple #5
0
        /// <summary>
        /// Exports the training data set: the sentence training input (via
        /// <c>WriteTranInputData</c>), the label index table (<c>Data\labels_index.tsv</c>)
        /// and, when the word space exposes word indexes, the word index table
        /// (<c>Data\words_index.tsv</c>).
        /// </summary>
        /// <param name="trainData">Training data set to export.</param>
        /// <exception cref="ArgumentNullException"><paramref name="trainData"/> is null.</exception>
        public void Export(CntkTrainDataSet trainData)
        {
            if (trainData == null)
            {
                throw new ArgumentNullException("trainData");
            }

            // Training input data.
            this.WriteTranInputData(trainData.Sentences);

            // Label index table. The message names the file that is actually written
            // (labels_index.tsv, not label_index.tsv).
            string labelFile = @"Data\labels_index.tsv";
            Console.WriteLine("creating labels_index.tsv in folder : Data");
            this.WriteWordIndexStore(trainData.LabelIndexes, labelFile);

            // Word index table: announced and written only when the word space has indexes,
            // so the log never claims a file that was not produced.
            ITextIndexes indexes = trainData.WordSpace.WordIndexes;
            if (indexes != null)
            {
                string wordsFilePath = @"Data\words_index.tsv";
                Console.WriteLine("creating words_index.tsv in folder: Data");
                this.WriteWordIndexStore(indexes, wordsFilePath);
            }
        }