/// <summary>
/// Writes a fastText pretraining corpus built from the answer units exposed by
/// an <c>AnswerStoreSimulator</c>: one preprocessed line per title and one per
/// answer unit body.
/// </summary>
/// <param name="answersDir">Directory handed to <c>AnswerStoreSimulator</c> as its second argument.</param>
/// <param name="answerUnitsFile">File handed to <c>AnswerStoreSimulator</c> as its first argument.</param>
/// <param name="pretrainingFilePath">Path of the UTF-8 output file; it is opened in non-append mode, so existing content is replaced.</param>
public static void GenerateFasttextPretrainingFileFromJsonDocuments(string answersDir, string answerUnitsFile, string pretrainingFilePath)
{
    // The store is built before the output file is opened, so a failure while
    // loading the documents leaves any existing output file untouched.
    var answerStore = new AnswerStoreSimulator(answerUnitsFile, answersDir);

    using (var writer = new StreamWriter(pretrainingFilePath, false, Encoding.UTF8))
    {
        foreach (var answerUnit in answerStore.AnswerUnits)
        {
            foreach (var heading in answerUnit.content.title)
            {
                // "no-title" is a placeholder value, not real text: keep it out of the corpus.
                if (heading == "no-title")
                {
                    continue;
                }

                var preprocessedHeading = SentenceClassifier.PreprocessSentence(heading);
                writer.WriteLine(preprocessedHeading);
            }

            // The body text always follows the titles of the same answer unit.
            var preprocessedBody = SentenceClassifier.PreprocessSentence(answerUnit.content.plainText);
            writer.WriteLine(preprocessedBody);
        }
    }
}
/// <summary>
/// Reads a CSV file line by line, keeps the text located before the last comma
/// of each line (the question) and writes it, preprocessed by
/// <c>SentenceClassifier.PreprocessSentence</c>, as one line of the output file.
/// </summary>
/// <param name="csvFilePath">Path of the UTF-8 CSV file to read. The text after the last comma of each line is discarded.</param>
/// <param name="pretrainingFilePath">Path of the UTF-8 output file; it is opened in non-append mode, so existing content is replaced.</param>
/// <exception cref="InvalidDataException">A line of the CSV file contains no comma.</exception>
public static void GenerateFasttextTrainingFileFromCsvTable(string csvFilePath, string pretrainingFilePath)
{
    // Report a missing input file instead of silently doing nothing, with the
    // same message as the (csvFilePath, splitTrainingSets) overload.
    if (!File.Exists(csvFilePath))
    {
        Console.WriteLine("ERROR : File " + csvFilePath + " doesn't exist");
        return;
    }

    using (StreamReader sr = new StreamReader(csvFilePath, Encoding.UTF8))
    using (StreamWriter sw = new StreamWriter(pretrainingFilePath, false, Encoding.UTF8))
    {
        int lineNumber = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            lineNumber++;

            // The last comma separates the question from the trailing column;
            // questions may themselves contain commas.
            int intentIndex = line.LastIndexOf(',');
            if (intentIndex < 0)
            {
                throw new InvalidDataException("invalid file format : no ',' separator found at line " + lineNumber + " of " + csvFilePath);
            }

            var question = line.Substring(0, intentIndex);
            sw.WriteLine(SentenceClassifier.PreprocessSentence(question));
        }
    }
}
/// <summary>
/// Reads a CSV file of "question,label" lines (split on the last comma),
/// shuffles the samples and writes <paramref name="splitTrainingSets"/> pairs of
/// fastText training / validation files in the directory of the CSV file.
/// </summary>
/// <remarks>
/// With N = sample count / <paramref name="splitTrainingSets"/> (integer division),
/// set number i (1-based) writes the shuffled samples [(i-1)*N, i*N) to
/// "&lt;csv name&gt;i.valid" and all the other samples to "&lt;csv name&gt;i.train".
/// Samples left over by the integer division are never used for validation.
/// When <paramref name="splitTrainingSets"/> is 1, every sample is written to
/// "&lt;csv name&gt;.train" and an empty "&lt;csv name&gt;.delete" file is produced.
/// Each output line is the label prefix, the label, a space, then the
/// preprocessed question.
/// </remarks>
/// <param name="csvFilePath">Path of the UTF-8 CSV file to read. If it does not exist an error is printed and nothing is written.</param>
/// <param name="splitTrainingSets">Number of training / validation file pairs to generate; must be at least 1.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="splitTrainingSets"/> is lower than 1.</exception>
/// <exception cref="InvalidDataException">A line of the CSV file contains no comma.</exception>
public static void GenerateFasttextTrainingFileFromCsvTable(string csvFilePath, int splitTrainingSets)
{
    // 0 would divide by zero below and a negative value would silently produce nothing.
    if (splitTrainingSets < 1)
    {
        throw new ArgumentOutOfRangeException("splitTrainingSets", splitTrainingSets, "The number of training sets must be at least 1");
    }

    if (!File.Exists(csvFilePath))
    {
        Console.WriteLine("ERROR : File " + csvFilePath + " doesn't exist");
        return;
    }

    Console.WriteLine("Reading file : " + csvFilePath + " ...");

    var questions = new List<LabelAndQuestion>();
    using (StreamReader sr = new StreamReader(csvFilePath, Encoding.UTF8))
    {
        int lineNumber = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            lineNumber++;

            // The last comma separates the question from its label;
            // questions may themselves contain commas.
            int intentIndex = line.LastIndexOf(',');
            if (intentIndex < 0)
            {
                throw new InvalidDataException("invalid file format : no ',' separator found at line " + lineNumber + " of " + csvFilePath);
            }

            var labelAndQuestion = new LabelAndQuestion();
            labelAndQuestion.Question = line.Substring(0, intentIndex);
            labelAndQuestion.Label = line.Substring(intentIndex + 1);
            questions.Add(labelAndQuestion);
        }
    }

    Shuffle(questions);

    // Build each output line once instead of once per training set.
    // NOTE(review): assumes SentenceClassifier.PreprocessSentence returns the same
    // result for the same input and has no side effects - confirm.
    var sampleLines = new List<string>(questions.Count);
    foreach (var labelAndQuestion in questions)
    {
        StringBuilder sbQuestion = new StringBuilder();
        sbQuestion.Append(FASTTEXT_LABEL_PREFIX);
        sbQuestion.Append(labelAndQuestion.Label);
        sbQuestion.Append(' ');
        sbQuestion.Append(SentenceClassifier.PreprocessSentence(labelAndQuestion.Question));
        sampleLines.Add(sbQuestion.ToString());
    }

    int bucketQuestionsCount = sampleLines.Count / splitTrainingSets;
    string csvFileName = Path.GetFileNameWithoutExtension(csvFilePath);
    // Path.Combine copes with an empty directory (bare file name); plain
    // concatenation would turn "data.csv" into a path at the filesystem root.
    string csvFileDirectory = Path.GetDirectoryName(csvFilePath) ?? string.Empty;
    bool singleTrainingSet = splitTrainingSets == 1;

    for (int trainingSetNumber = 1; trainingSetNumber <= splitTrainingSets; trainingSetNumber++)
    {
        string trainingFilePath;
        string validationFilePath;
        if (singleTrainingSet)
        {
            // With a single set the validation bucket spans every sample, so the
            // "validation" writer is pointed at the real .train file and the
            // "training" writer at a throw-away file that stays empty.
            trainingFilePath = Path.Combine(csvFileDirectory, csvFileName + ".delete");
            validationFilePath = Path.Combine(csvFileDirectory, csvFileName + ".train");
        }
        else
        {
            trainingFilePath = Path.Combine(csvFileDirectory, csvFileName + trainingSetNumber + ".train");
            validationFilePath = Path.Combine(csvFileDirectory, csvFileName + trainingSetNumber + ".valid");
        }

        // Half-open range [validationStart, validationEnd) of samples held out for this set.
        int validationStart = (trainingSetNumber - 1) * bucketQuestionsCount;
        int validationEnd = trainingSetNumber * bucketQuestionsCount;

        // Counted per set and per file so the report below matches what was actually written.
        int trainingSamplesCount = 0;
        int validationSamplesCount = 0;

        using (StreamWriter trainsw = new StreamWriter(trainingFilePath, false, Encoding.UTF8))
        using (StreamWriter validsw = new StreamWriter(validationFilePath, false, Encoding.UTF8))
        {
            for (int questionIndex = 0; questionIndex < sampleLines.Count; questionIndex++)
            {
                bool writeToValidation = questionIndex >= validationStart && questionIndex < validationEnd;
                if (writeToValidation)
                {
                    validsw.WriteLine(sampleLines[questionIndex]);
                    validationSamplesCount++;
                }
                else
                {
                    trainsw.WriteLine(sampleLines[questionIndex]);
                    trainingSamplesCount++;
                }
            }
        }

        if (singleTrainingSet)
        {
            // All the samples went to the .train file through the validation writer.
            Console.WriteLine("OK - " + validationSamplesCount + " training samples written to " + validationFilePath);
        }
        else
        {
            Console.WriteLine("OK - " + trainingSamplesCount + " training samples written to " + trainingFilePath);
            Console.WriteLine("OK - " + validationSamplesCount + " validation samples written to " + validationFilePath);
        }
    }
}