/// <summary>
/// Dumps the text of every answer unit into a single UTF-8 file, one preprocessed
/// sentence per line, overwriting any existing file at <paramref name="pretrainingFilePath"/>.
/// </summary>
/// <remarks>
/// For each answer unit, every title different from the "no-title" placeholder is written
/// first, followed by the unit's plain text. All lines go through
/// <c>SentenceClassifier.PreprocessSentence</c> before being written.
/// </remarks>
/// <param name="answersDir">Second argument handed to the <c>AnswerStoreSimulator</c> constructor.</param>
/// <param name="answerUnitsFile">First argument handed to the <c>AnswerStoreSimulator</c> constructor.</param>
/// <param name="pretrainingFilePath">Path of the text file to create or overwrite.</param>
public static void GenerateFasttextPretrainingFileFromJsonDocuments(string answersDir, string answerUnitsFile, string pretrainingFilePath)
{
    // The store is built before the output file is opened, exactly as before,
    // so a failure while loading the documents never truncates an existing file.
    var answerStore = new AnswerStoreSimulator(answerUnitsFile, answersDir);

    using (var writer = new StreamWriter(pretrainingFilePath, false, Encoding.UTF8))
    {
        foreach (var answerUnit in answerStore.AnswerUnits)
        {
            foreach (var unitTitle in answerUnit.content.title)
            {
                // "no-title" is a placeholder, not real text: leave it out of the corpus.
                if (unitTitle == "no-title")
                {
                    continue;
                }

                var cleanedTitle = SentenceClassifier.PreprocessSentence(unitTitle);
                writer.WriteLine(cleanedTitle);
            }

            var cleanedText = SentenceClassifier.PreprocessSentence(answerUnit.content.plainText);
            writer.WriteLine(cleanedText);
        }
    }
}
/// <summary>
/// Extracts the question column of a "question,label" CSV file and writes one preprocessed
/// question per line (labels are dropped) to <paramref name="pretrainingFilePath"/>.
/// </summary>
/// <remarks>
/// Each input line is split at its LAST comma: everything before it is the question, so
/// questions may themselves contain commas. The output file is UTF-8 and is overwritten.
/// When the CSV file does not exist, an error is printed to the console and nothing is
/// written - the same diagnostic the (csvFilePath, splitTrainingSets) overload prints.
/// </remarks>
/// <param name="csvFilePath">Path of the UTF-8 CSV file to read.</param>
/// <param name="pretrainingFilePath">Path of the text file to create or overwrite.</param>
/// <exception cref="InvalidDataException">A line of the CSV file contains no comma.</exception>
public static void GenerateFasttextTrainingFileFromCsvTable(string csvFilePath, string pretrainingFilePath)
{
    if (!File.Exists(csvFilePath))
    {
        // Previously this case was a silent no-op, which made a mistyped path
        // indistinguishable from a successful run.
        Console.WriteLine("ERROR : File " + csvFilePath + " doesn't exist");
        return;
    }

    using (StreamReader sr = new StreamReader(csvFilePath, Encoding.UTF8))
    using (StreamWriter sw = new StreamWriter(pretrainingFilePath, false, Encoding.UTF8))
    {
        int lineNumber = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            lineNumber++;

            // The label is the last column, so the last comma is the separator.
            int intentIndex = line.LastIndexOf(',');
            if (intentIndex < 0)
            {
                // InvalidDataException derives from Exception, so existing
                // catch (Exception) handlers keep working.
                throw new InvalidDataException(
                    "invalid file format (line " + lineNumber + " of " + csvFilePath + " has no ',' separator)");
            }

            string question = line.Substring(0, intentIndex);
            sw.WriteLine(SentenceClassifier.PreprocessSentence(question));
        }
    }
}
/// <summary>
/// Evaluates a trained model against a validation file and writes a semicolon-separated
/// report to <c>..\fasttext\model_savings1.results.csv</c> (ISO-8859-1, overwritten).
/// </summary>
/// <remarks>
/// Inputs are hard-coded: the ".train" and ".valid" files of dataset "SAV_NLC_100520171"
/// and the model "model_savings1", all under <c>..\fasttext\tests\</c>.
/// The report has three sections:
/// 1. per-intent sample counts, F1, precision and recall ("N/A" when the metric is undefined);
/// 2. the 50 intent pairs with the most mutual confusions;
/// 3. for every intent, its training questions, the validation questions correctly
///    classified, those missed, and those wrongly attracted from other intents.
/// A validation question that appears several times is predicted once, with the intent of
/// its first occurrence; the "# Validation" column still counts every occurrence.
/// </remarks>
/// <exception cref="InvalidDataException">
/// A non-blank line of the training or validation file is not "label question".
/// </exception>
public static void GeneratePerfAnalysisFile()
{
    const string workingDir = @"..\fasttext\tests\";
    const string datasetName = "SAV_NLC_100520171";
    const string modelName = "model_savings1";

    // ---- Training set: per-intent counts and sample questions ----
    ISet<string> intentsSet = new HashSet<string>();
    var intentsCountTraining = new Dictionary<string, int>();
    var trainingSamples = new Dictionary<string, IList<string>>();
    foreach (var sample in ReadLabeledSamplesForPerfAnalysis(workingDir + datasetName + ".train"))
    {
        string intent = sample.Key;
        intentsSet.Add(intent);

        int seen;
        intentsCountTraining.TryGetValue(intent, out seen); // leaves 0 when absent
        intentsCountTraining[intent] = seen + 1;

        IList<string> questionsOfIntent;
        if (!trainingSamples.TryGetValue(intent, out questionsOfIntent))
        {
            questionsOfIntent = new List<string>();
            trainingSamples.Add(intent, questionsOfIntent);
        }
        questionsOfIntent.Add(sample.Value);
    }

    // ---- Validation set: per-intent counts and the question -> expected intent map ----
    IDictionary<string, string> annotatedQuestions = new Dictionary<string, string>();
    var intentsCountValidation = new Dictionary<string, int>();
    foreach (var sample in ReadLabeledSamplesForPerfAnalysis(workingDir + datasetName + ".valid"))
    {
        string intent = sample.Key;
        string question = sample.Value;
        intentsSet.Add(intent);

        int seen;
        intentsCountValidation.TryGetValue(intent, out seen);
        intentsCountValidation[intent] = seen + 1;

        // First annotation wins when the same question occurs more than once.
        if (!annotatedQuestions.ContainsKey(question))
        {
            annotatedQuestions.Add(question, intent);
        }
    }

    // ---- Stable intent <-> index mapping (sorted by name) ----
    IList<string> intents = new List<string>(intentsSet.OrderBy(name => name));
    IDictionary<string, int> intentsIndexes = new Dictionary<string, int>();
    for (int i = 0; i < intents.Count; i++)
    {
        intentsIndexes.Add(intents[i], i);
    }

    // ---- Predictions and confusion matrix [expected, predicted] ----
    int[,] confusionMatrix = new int[intents.Count, intents.Count];
    var predictionResults = new List<PredictionResult>();
    using (SentenceClassifier classifier = new SentenceClassifier(workingDir + modelName))
    {
        foreach (var annotated in annotatedQuestions)
        {
            string question = annotated.Key;
            int annotatedIntentIndex = intentsIndexes[annotated.Value];
            var result = classifier.PredictLabels(question);

            var prediction = new PredictionResult();
            prediction.Question = question;
            prediction.ExpectedIntentIndex = annotatedIntentIndex;
            // NOTE(review): throws KeyNotFoundException if the model predicts a label that
            // appears in neither the training nor the validation file.
            prediction.IntentIndex1 = intentsIndexes[result.Label1];
            prediction.IntentProba1 = result.Proba1;
            if (result.Label2 != null)
            {
                prediction.IntentIndex2 = intentsIndexes[result.Label2];
                prediction.IntentProba2 = result.Proba2;
            }
            else
            {
                // Explicit "no second label" marker; without it the report would look up
                // whatever value the index was left at and print an unrelated intent.
                prediction.IntentIndex2 = -1;
            }
            predictionResults.Add(prediction);
            confusionMatrix[annotatedIntentIndex, prediction.IntentIndex1]++;
        }
    }

    // ---- Per-intent precision / recall / F1 (-1 means "undefined") ----
    var intentsPerfs = new List<IntentPerf>();
    for (int intentIndex = 0; intentIndex < intents.Count; intentIndex++)
    {
        int truePositives = confusionMatrix[intentIndex, intentIndex];
        int falsePositives = 0; // column sum without the diagonal
        int falseNegatives = 0; // row sum without the diagonal
        for (int j = 0; j < intents.Count; j++)
        {
            if (j != intentIndex)
            {
                falsePositives += confusionMatrix[j, intentIndex];
                falseNegatives += confusionMatrix[intentIndex, j];
            }
        }

        var intentPerf = new IntentPerf();
        intentPerf.Intent = intents[intentIndex];
        if ((truePositives + falsePositives) > 0)
        {
            intentPerf.Precision = (float)truePositives / (truePositives + falsePositives);
        }
        else
        {
            intentPerf.Precision = -1;
        }
        if ((truePositives + falseNegatives) > 0)
        {
            intentPerf.Recall = (float)truePositives / (truePositives + falseNegatives);
        }
        else
        {
            intentPerf.Recall = -1;
        }
        // F1 is only reported when both precision and recall are strictly positive.
        if (intentPerf.Precision > 0 && intentPerf.Recall > 0)
        {
            intentPerf.F1 = 2 * intentPerf.Precision * intentPerf.Recall / (intentPerf.Precision + intentPerf.Recall);
        }
        else
        {
            intentPerf.F1 = -1;
        }
        intentsPerfs.Add(intentPerf);
    }

    // ---- Pairwise confusion for every unordered pair of distinct intents ----
    var intentsConfusion = new List<IntentsPairConfusion>();
    for (int firstIndex = 0; firstIndex < intents.Count; firstIndex++)
    {
        for (int secondIndex = 0; secondIndex < firstIndex; secondIndex++)
        {
            var intentsPair = new IntentsPairConfusion();
            intentsPair.IntentIndex1 = firstIndex;
            intentsPair.IntentIndex2 = secondIndex;
            intentsPair.Expected1Found1 = confusionMatrix[firstIndex, firstIndex];
            intentsPair.Expected1Found2 = confusionMatrix[firstIndex, secondIndex];
            intentsPair.Expected2Found1 = confusionMatrix[secondIndex, firstIndex];
            intentsPair.Expected2Found2 = confusionMatrix[secondIndex, secondIndex];
            intentsPair.ConfusionErrors = intentsPair.Expected1Found2 + intentsPair.Expected2Found1;

            // Rates are relative to the number of CORRECT predictions of the expected
            // intent (not to its total), so they can exceed 1. With no correct
            // prediction at all, any confusion is reported as a rate of 1.
            if (intentsPair.Expected1Found1 > 0)
            {
                intentsPair.ConfusionRate1To2 = (float)intentsPair.Expected1Found2 / intentsPair.Expected1Found1;
            }
            else if (intentsPair.Expected1Found2 > 0)
            {
                intentsPair.ConfusionRate1To2 = 1;
            }
            if (intentsPair.Expected2Found2 > 0)
            {
                intentsPair.ConfusionRate2To1 = (float)intentsPair.Expected2Found1 / intentsPair.Expected2Found2;
            }
            else if (intentsPair.Expected2Found1 > 0)
            {
                intentsPair.ConfusionRate2To1 = 1;
            }
            intentsConfusion.Add(intentsPair);
        }
    }

    // ---- Report ----
    using (StreamWriter sw = new StreamWriter(@"..\fasttext\" + modelName + ".results.csv", false, Encoding.GetEncoding("iso8859-1")))
    {
        sw.WriteLine("1. Intents performance");
        sw.WriteLine();
        sw.WriteLine("Intent;# Training;# Validation;F1;Precision;Recall");
        foreach (var intentPerf in intentsPerfs.OrderByDescending(perf => perf.F1))
        {
            int trainingCount;
            intentsCountTraining.TryGetValue(intentPerf.Intent, out trainingCount);
            int validationCount;
            intentsCountValidation.TryGetValue(intentPerf.Intent, out validationCount);

            sw.Write(intentPerf.Intent);
            sw.Write(';');
            sw.Write(trainingCount);
            sw.Write(';');
            sw.Write(validationCount);
            sw.Write(';');
            sw.Write(intentPerf.F1 < 0 ? "N/A" : intentPerf.F1.ToString("N2"));
            sw.Write(';');
            sw.Write(intentPerf.Precision < 0 ? "N/A" : intentPerf.Precision.ToString("N2"));
            sw.Write(';');
            sw.Write(intentPerf.Recall < 0 ? "N/A" : intentPerf.Recall.ToString("N2"));
            sw.WriteLine();
        }
        sw.WriteLine();

        sw.WriteLine("2. Intents confusion (top 50)");
        sw.WriteLine();
        sw.WriteLine("Intent 1;Intent 2;# Confusions;# Confusion 1>2;% Confusion 1>2;# Confusion 2>1;% Confusion 2>1");
        foreach (var intentPair in intentsConfusion.OrderByDescending(pair => pair.ConfusionErrors).Take(50))
        {
            sw.Write(intents[intentPair.IntentIndex1]);
            sw.Write(';');
            sw.Write(intents[intentPair.IntentIndex2]);
            sw.Write(';');
            sw.Write(intentPair.ConfusionErrors);
            sw.Write(';');
            sw.Write(intentPair.Expected1Found2);
            sw.Write(';');
            sw.Write(intentPair.ConfusionRate1To2.ToString("N2"));
            sw.Write(';');
            sw.Write(intentPair.Expected2Found1);
            sw.Write(';');
            sw.Write(intentPair.ConfusionRate2To1.ToString("N2"));
            sw.WriteLine();
        }
        sw.WriteLine();
        sw.WriteLine();
        sw.WriteLine();

        sw.WriteLine("3. Detailed error analysis for each intent");
        sw.WriteLine();
        for (int expectedIntentIndex = 0; expectedIntentIndex < intents.Count; expectedIntentIndex++)
        {
            var expectedIntent = intents[expectedIntentIndex];
            sw.WriteLine(">>> " + expectedIntent);
            sw.WriteLine();

            sw.WriteLine("Training set questions");
            IList<string> trainingQuestions;
            if (trainingSamples.TryGetValue(expectedIntent, out trainingQuestions))
            {
                foreach (var trainingQuestion in trainingQuestions)
                {
                    sw.WriteLine(trainingQuestion);
                }
            }
            else
            {
                sw.WriteLine("-- NONE --");
            }
            sw.WriteLine();

            sw.WriteLine("Questions correctly classified in this class");
            foreach (var hit in predictionResults.Where(pred => pred.ExpectedIntentIndex == expectedIntentIndex && pred.IntentIndex1 == expectedIntentIndex).OrderByDescending(pred => pred.IntentProba1))
            {
                // ';' is the column separator, so it is blanked out of the question text.
                sw.Write(hit.Question.Replace(';', ' '));
                sw.Write(';');
                sw.Write(hit.IntentProba1);
                sw.Write(';');
                sw.Write(hit.IntentProba1 - hit.IntentProba2);
                sw.WriteLine();
            }
            sw.WriteLine();

            sw.WriteLine("Questions from this class incorrectly classified in another class");
            foreach (var miss in predictionResults.Where(pred => pred.ExpectedIntentIndex == expectedIntentIndex && pred.IntentIndex1 != expectedIntentIndex).OrderBy(pred => pred.IntentIndex1).ThenByDescending(pred => pred.IntentProba1))
            {
                sw.Write(miss.Question.Replace(';', ' '));
                sw.Write(';');
                sw.Write(intents[miss.IntentIndex1]);
                sw.Write(';');
                sw.Write(miss.IntentProba1);
                sw.Write(';');
                sw.Write(miss.IntentProba1 - miss.IntentProba2);
                sw.Write(';');
                // Empty column when the classifier returned no second label.
                sw.Write(miss.IntentIndex2 >= 0 ? intents[miss.IntentIndex2] : "");
                sw.Write(';');
                sw.Write(miss.IntentProba2);
                sw.WriteLine();
            }
            sw.WriteLine();

            sw.WriteLine("Questions from another class incorrectly classified in this class");
            foreach (var intruder in predictionResults.Where(pred => pred.ExpectedIntentIndex != expectedIntentIndex && pred.IntentIndex1 == expectedIntentIndex).OrderBy(pred => pred.ExpectedIntentIndex).ThenByDescending(pred => pred.IntentProba1))
            {
                sw.Write(intruder.Question.Replace(';', ' '));
                sw.Write(';');
                sw.Write(intents[intruder.ExpectedIntentIndex]);
                sw.Write(';');
                sw.Write(intruder.IntentProba1);
                sw.Write(';');
                sw.Write(intruder.IntentProba1 - intruder.IntentProba2);
                sw.Write(';');
                // Empty column when the classifier returned no second label.
                sw.Write(intruder.IntentIndex2 >= 0 ? intents[intruder.IntentIndex2] : "");
                sw.Write(';');
                sw.Write(intruder.IntentProba2);
                sw.WriteLine();
            }
            sw.WriteLine();
        }
    }
}

/// <summary>
/// Reads a UTF-8 file whose lines are "label-prefix + intent + space + question" and
/// returns the (intent, question) pairs in file order. Blank lines are skipped.
/// </summary>
/// <exception cref="InvalidDataException">
/// A non-blank line has no space after a label of at least the prefix length.
/// </exception>
private static List<KeyValuePair<string, string>> ReadLabeledSamplesForPerfAnalysis(string filePath)
{
    // 9 is the length of fastText's default "__label__" prefix.
    // NOTE(review): presumably equal to the length of the prefix used when these files
    // are generated - confirm if that prefix is ever customised.
    const int labelPrefixLength = 9;

    var samples = new List<KeyValuePair<string, string>>();
    using (StreamReader sr = new StreamReader(filePath, Encoding.UTF8))
    {
        int lineNumber = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            lineNumber++;
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            int endOfLabel = line.IndexOf(' ');
            if (endOfLabel < labelPrefixLength)
            {
                // Without this check Substring fails with an ArgumentOutOfRangeException
                // that says nothing about which file or line is at fault.
                throw new InvalidDataException(
                    "invalid file format (line " + lineNumber + " of " + filePath + " is not '<label> <question>')");
            }

            string intent = line.Substring(labelPrefixLength, endOfLabel - labelPrefixLength);
            string question = line.Substring(endOfLabel + 1);
            samples.Add(new KeyValuePair<string, string>(intent, question));
        }
    }
    return samples;
}
/// <summary>
/// Converts a "question,label" CSV file into shuffled fastText files (label prefix + label,
/// a space, then the preprocessed question), written next to the CSV file.
/// </summary>
/// <remarks>
/// Each input line is split at its LAST comma: the text before it is the question, the
/// text after it is the label. The samples are shuffled once, then:
/// - with <paramref name="splitTrainingSets"/> == 1, every sample goes to "name.train"
///   (an empty "name.delete" file is still created alongside it, as before);
/// - otherwise, for each k in 1..splitTrainingSets, the k-th slice of the shuffled samples
///   goes to "name{k}.valid" and all remaining samples go to "name{k}.train". Slices are
///   balanced so that every sample is held out in exactly one of them.
/// When the CSV file does not exist, an error is printed to the console and nothing is written.
/// </remarks>
/// <param name="csvFilePath">Path of the UTF-8 CSV file to read.</param>
/// <param name="splitTrainingSets">Number of training/validation file pairs to produce (at least 1).</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="splitTrainingSets"/> is less than 1.</exception>
/// <exception cref="InvalidDataException">A line of the CSV file contains no comma.</exception>
public static void GenerateFasttextTrainingFileFromCsvTable(string csvFilePath, int splitTrainingSets)
{
    if (splitTrainingSets < 1)
    {
        // 0 used to surface as a DivideByZeroException after the whole file was read,
        // and a negative value silently produced nothing.
        throw new ArgumentOutOfRangeException("splitTrainingSets", splitTrainingSets, "at least one training set is required");
    }

    if (!File.Exists(csvFilePath))
    {
        Console.WriteLine("ERROR : File " + csvFilePath + " doesn't exist");
        return;
    }

    Console.WriteLine("Reading file : " + csvFilePath + " ...");
    var questions = new List<LabelAndQuestion>();
    using (StreamReader sr = new StreamReader(csvFilePath, Encoding.UTF8))
    {
        int lineNumber = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            lineNumber++;

            // The label is the last column, so the last comma is the separator.
            int intentIndex = line.LastIndexOf(',');
            if (intentIndex < 0)
            {
                // InvalidDataException derives from Exception, so existing
                // catch (Exception) handlers keep working.
                throw new InvalidDataException(
                    "invalid file format (line " + lineNumber + " of " + csvFilePath + " has no ',' separator)");
            }

            var labelAndQuestion = new LabelAndQuestion();
            labelAndQuestion.Question = line.Substring(0, intentIndex);
            labelAndQuestion.Label = line.Substring(intentIndex + 1);
            questions.Add(labelAndQuestion);
        }
    }

    Shuffle(questions);

    // Format every sample once; the text does not depend on the training set being written.
    var sampleLines = new List<string>(questions.Count);
    foreach (var labelAndQuestion in questions)
    {
        StringBuilder sbQuestion = new StringBuilder();
        sbQuestion.Append(FASTTEXT_LABEL_PREFIX);
        sbQuestion.Append(labelAndQuestion.Label);
        sbQuestion.Append(' ');
        sbQuestion.Append(SentenceClassifier.PreprocessSentence(labelAndQuestion.Question));
        sampleLines.Add(sbQuestion.ToString());
    }

    string csvFileName = Path.GetFileNameWithoutExtension(csvFilePath);
    // GetDirectoryName returns "" for a bare file name (and null for a root path);
    // Path.Combine then yields a path relative to the current directory instead of the
    // root-relative path that manual concatenation with a separator produced.
    string csvFileDirectory = Path.GetDirectoryName(csvFilePath) ?? string.Empty;
    bool singleSet = splitTrainingSets == 1;

    for (int trainingSetNumber = 1; trainingSetNumber <= splitTrainingSets; trainingSetNumber++)
    {
        string trainingFilePath;
        string validationFilePath;
        if (singleSet)
        {
            // With a single set the held-out slice is the whole data set, so the
            // "validation" writer is pointed at the real .train file and the
            // "training" writer at a throw-away file that stays empty.
            trainingFilePath = Path.Combine(csvFileDirectory, csvFileName + ".delete");
            validationFilePath = Path.Combine(csvFileDirectory, csvFileName + ".train");
        }
        else
        {
            trainingFilePath = Path.Combine(csvFileDirectory, csvFileName + trainingSetNumber + ".train");
            validationFilePath = Path.Combine(csvFileDirectory, csvFileName + trainingSetNumber + ".valid");
        }

        // Balanced slice boundaries: identical to fixed-size buckets when the sample count
        // is a multiple of splitTrainingSets, and otherwise spreads the remainder so that
        // no sample is left out of every validation file.
        int validationStart = (int)((long)(trainingSetNumber - 1) * sampleLines.Count / splitTrainingSets);
        int validationEnd = (int)((long)trainingSetNumber * sampleLines.Count / splitTrainingSets);

        int trainingCount = 0;
        int validationCount = 0;
        using (StreamWriter trainsw = new StreamWriter(trainingFilePath, false, Encoding.UTF8))
        using (StreamWriter validsw = new StreamWriter(validationFilePath, false, Encoding.UTF8))
        {
            for (int sampleIndex = 0; sampleIndex < sampleLines.Count; sampleIndex++)
            {
                bool writeToValidation = sampleIndex >= validationStart && sampleIndex < validationEnd;
                if (writeToValidation)
                {
                    validsw.WriteLine(sampleLines[sampleIndex]);
                    validationCount++;
                }
                else
                {
                    trainsw.WriteLine(sampleLines[sampleIndex]);
                    trainingCount++;
                }
            }
        }

        // Counts are per file; the previous single counter accumulated training AND
        // validation lines across all sets and reported the total as "training samples".
        if (singleSet)
        {
            Console.WriteLine("OK - " + validationCount + " training samples written to " + validationFilePath);
        }
        else
        {
            Console.WriteLine("OK - " + trainingCount + " training samples written to " + trainingFilePath);
            Console.WriteLine("OK - " + validationCount + " validation samples written to " + validationFilePath);
        }
    }
}