示例#1
0
        /// <summary>
        /// Builds a fastText pretraining text file from the answer units exposed by an
        /// <c>AnswerStoreSimulator</c>.
        /// </summary>
        /// <remarks>
        /// For every answer unit, each title different from "no-title" is preprocessed with
        /// <c>SentenceClassifier.PreprocessSentence</c> and written with <c>WriteLine</c>,
        /// followed by the preprocessed plain text of the unit.
        /// </remarks>
        /// <param name="answersDir">Second argument passed to the <c>AnswerStoreSimulator</c> constructor.</param>
        /// <param name="answerUnitsFile">First argument passed to the <c>AnswerStoreSimulator</c> constructor.</param>
        /// <param name="pretrainingFilePath">UTF-8 output file; it is overwritten, not appended to.</param>
        public static void GenerateFasttextPretrainingFileFromJsonDocuments(string answersDir, string answerUnitsFile, string pretrainingFilePath)
        {
            var answerStore = new AnswerStoreSimulator(answerUnitsFile, answersDir);

            using (var output = new StreamWriter(pretrainingFilePath, false, Encoding.UTF8))
            {
                foreach (var answerUnit in answerStore.AnswerUnits)
                {
                    // Titles first, skipping the "no-title" entries
                    foreach (var title in answerUnit.content.title)
                    {
                        if (title == "no-title")
                        {
                            continue;
                        }

                        var cleanedTitle = SentenceClassifier.PreprocessSentence(title);
                        output.WriteLine(cleanedTitle);
                    }

                    // Then the body text of the answer unit
                    var cleanedText = SentenceClassifier.PreprocessSentence(answerUnit.content.plainText);
                    output.WriteLine(cleanedText);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Extracts the question part of every line of a CSV file and writes the preprocessed
        /// questions, one per line, to a fastText pretraining file.
        /// </summary>
        /// <remarks>
        /// The question is the text located before the LAST comma of the line, so it may itself
        /// contain commas; whatever follows the last comma is ignored. No CSV quoting or
        /// escaping is interpreted. When the input file does not exist, an error is written to
        /// the console and no output file is created.
        /// </remarks>
        /// <param name="csvFilePath">UTF-8 input file, one sample per line.</param>
        /// <param name="pretrainingFilePath">UTF-8 output file; it is overwritten, not appended to.</param>
        /// <exception cref="FormatException">A line of the input file contains no comma.</exception>
        public static void GenerateFasttextTrainingFileFromCsvTable(string csvFilePath, string pretrainingFilePath)
        {
            if (!File.Exists(csvFilePath))
            {
                // Same report as the (csvFilePath, splitTrainingSets) overload instead of a silent no-op
                Console.WriteLine("ERROR : File " + csvFilePath + " doesn't exist");
                return;
            }

            using (StreamReader sr = new StreamReader(csvFilePath, Encoding.UTF8))
            using (StreamWriter sw = new StreamWriter(pretrainingFilePath, false, Encoding.UTF8))
            {
                int lineNumber = 0;
                string line = null;
                while ((line = sr.ReadLine()) != null)
                {
                    lineNumber++;

                    // The separator is the last comma : everything before it is the question
                    int intentIndex = line.LastIndexOf(',');
                    if (intentIndex < 0)
                    {
                        throw new FormatException("invalid file format : no ',' separator on line " + lineNumber + " of " + csvFilePath);
                    }
                    var question = line.Substring(0, intentIndex);

                    sw.WriteLine(SentenceClassifier.PreprocessSentence(question));
                }
            }
        }
示例#3
0
        /// <summary>
        /// Runs a trained model on a validation set and writes a semicolon-separated report :
        /// per-intent precision / recall / F1, the 50 most confused pairs of intents, and a
        /// per-intent listing of training questions and of correct / incorrect predictions.
        /// </summary>
        /// <remarks>
        /// All inputs are hard-coded : the ".train" and ".valid" files of DATASET_NAME and the
        /// model MODEL_NAME are read from WORKING_DIR, and the report is written to
        /// "..\fasttext\" + MODEL_NAME + ".results.csv" with the "iso8859-1" encoding.
        /// Every non-blank line of the two input files must have the form
        /// "[9-character prefix][intent] [question]". Blank lines are ignored.
        /// Precision, recall and F1 use -1 as the "cannot be computed" marker, which is
        /// written as "N/A" in the report.
        /// NOTE(review): a label predicted by the model but absent from both input files makes
        /// the intent index lookup throw KeyNotFoundException - confirm whether this can happen.
        /// </remarks>
        public static void GeneratePerfAnalysisFile()
        {
            var WORKING_DIR  = @"..\fasttext\tests\";
            var DATASET_NAME = "SAV_NLC_100520171";
            var MODEL_NAME   = "model_savings1";

            // Number of characters located before the intent name on each line
            // (presumably the length of the fastText "__label__" prefix - TODO confirm)
            const int labelPrefixLength = 9;
            // Value stored in IntentIndex2 when the classifier returned no second label
            const int noSecondIntent = -1;

            // ---- 1. Read the training set : samples and sample counts per intent ----

            string trainingFilePath = WORKING_DIR + DATASET_NAME + ".train";
            ISet<string> intentsSet = new HashSet<string>();
            var intentsCountTraining = new Dictionary<string, int>();
            var trainingSamples = new Dictionary<string, IList<string>>();

            using (StreamReader sr = new StreamReader(trainingFilePath, Encoding.UTF8))
            {
                string line = null;
                while ((line = sr.ReadLine()) != null)
                {
                    // A blank line carries no sample and would make Substring throw
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    int endOfLabel = line.IndexOf(' ');
                    string intent = line.Substring(labelPrefixLength, endOfLabel - labelPrefixLength);
                    string trainingQuestion = line.Substring(endOfLabel + 1);

                    intentsSet.Add(intent);

                    int countSoFar;
                    intentsCountTraining.TryGetValue(intent, out countSoFar);
                    intentsCountTraining[intent] = countSoFar + 1;

                    IList<string> samplesOfIntent;
                    if (!trainingSamples.TryGetValue(intent, out samplesOfIntent))
                    {
                        samplesOfIntent = new List<string>();
                        trainingSamples.Add(intent, samplesOfIntent);
                    }
                    samplesOfIntent.Add(trainingQuestion);
                }
            }

            // ---- 2. Read the validation set : expected intent of each distinct question ----

            string validationFilePath = WORKING_DIR + DATASET_NAME + ".valid";
            IDictionary<string, string> annotatedQuestions = new Dictionary<string, string>();
            var intentsCountValidation = new Dictionary<string, int>();

            using (StreamReader sr = new StreamReader(validationFilePath, Encoding.UTF8))
            {
                string line = null;
                while ((line = sr.ReadLine()) != null)
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    int endOfLabel = line.IndexOf(' ');
                    string intent = line.Substring(labelPrefixLength, endOfLabel - labelPrefixLength);
                    string question = line.Substring(endOfLabel + 1);

                    intentsSet.Add(intent);

                    // Every line is counted, even when the question is a duplicate
                    int countSoFar;
                    intentsCountValidation.TryGetValue(intent, out countSoFar);
                    intentsCountValidation[intent] = countSoFar + 1;

                    // For a question appearing several times, the first annotation wins
                    if (!annotatedQuestions.ContainsKey(question))
                    {
                        annotatedQuestions.Add(question, intent);
                    }
                }
            }

            // ---- 3. Give each intent a stable index (alphabetical order) ----

            IList<string> intents = new List<string>(intentsSet.OrderBy(i => i));
            IDictionary<string, int> intentsIndexes = new Dictionary<string, int>();
            for (int i = 0; i < intents.Count; i++)
            {
                intentsIndexes.Add(intents[i], i);
            }

            // ---- 4. Classify each validation question ----

            // confusionMatrix[expected intent index, predicted intent index]
            int[,] confusionMatrix = new int[intents.Count, intents.Count];
            var predictionResults = new List<PredictionResult>();

            using (SentenceClassifier classifier = new SentenceClassifier(WORKING_DIR + MODEL_NAME))
            {
                foreach (var annotatedQuestion in annotatedQuestions)
                {
                    var question = annotatedQuestion.Key;
                    var annotatedIntentIndex = intentsIndexes[annotatedQuestion.Value];

                    var result = classifier.PredictLabels(question);

                    var predictionResult = new PredictionResult();
                    predictionResult.Question            = question;
                    predictionResult.ExpectedIntentIndex = annotatedIntentIndex;
                    predictionResult.IntentIndex1        = intentsIndexes[result.Label1];
                    predictionResult.IntentProba1        = result.Proba1;
                    if (result.Label2 != null)
                    {
                        predictionResult.IntentIndex2 = intentsIndexes[result.Label2];
                        predictionResult.IntentProba2 = result.Proba2;
                    }
                    else
                    {
                        // Without this marker the report would display an arbitrary intent
                        // as second prediction
                        predictionResult.IntentIndex2 = noSecondIntent;
                    }
                    predictionResults.Add(predictionResult);

                    confusionMatrix[annotatedIntentIndex, predictionResult.IntentIndex1]++;
                }
            }

            // ---- 5. Precision / recall / F1 of each intent ----

            var intentsPerfs = new List<IntentPerf>();

            for (int intentIndex = 0; intentIndex < intents.Count; intentIndex++)
            {
                int truePositives  = confusionMatrix[intentIndex, intentIndex];
                int falsePositives = 0; // column of the intent, diagonal excluded
                int falseNegatives = 0; // row of the intent, diagonal excluded
                for (int j = 0; j < intents.Count; j++)
                {
                    if (j != intentIndex)
                    {
                        falsePositives += confusionMatrix[j, intentIndex];
                        falseNegatives += confusionMatrix[intentIndex, j];
                    }
                }

                var intentPerf = new IntentPerf();
                intentPerf.Intent = intents[intentIndex];
                if ((truePositives + falsePositives) > 0)
                {
                    intentPerf.Precision = (float)truePositives / (truePositives + falsePositives);
                }
                else
                {
                    intentPerf.Precision = -1;
                }
                if ((truePositives + falseNegatives) > 0)
                {
                    intentPerf.Recall = (float)truePositives / (truePositives + falseNegatives);
                }
                else
                {
                    intentPerf.Recall = -1;
                }
                if (intentPerf.Precision > 0 && intentPerf.Recall > 0)
                {
                    intentPerf.F1 = 2 * intentPerf.Precision * intentPerf.Recall / (intentPerf.Precision + intentPerf.Recall);
                }
                else if (falsePositives > 0 && falseNegatives > 0)
                {
                    // No true positive but precision and recall are both defined (equal to 0) :
                    // F1 = 2TP / (2TP + FP + FN) is a genuine 0, not an undefined value
                    intentPerf.F1 = 0;
                }
                else
                {
                    intentPerf.F1 = -1;
                }
                intentsPerfs.Add(intentPerf);
            }

            // ---- 6. Confusion statistics of each unordered pair of distinct intents ----

            var intentsConfusion = new List<IntentsPairConfusion>();

            for (int expectedIntentIndex = 0; expectedIntentIndex < intents.Count; expectedIntentIndex++)
            {
                for (int predictedIntentIndex = 0; predictedIntentIndex < expectedIntentIndex; predictedIntentIndex++)
                {
                    var intentsPair = new IntentsPairConfusion();
                    intentsPair.IntentIndex1    = expectedIntentIndex;
                    intentsPair.IntentIndex2    = predictedIntentIndex;
                    intentsPair.Expected1Found1 = confusionMatrix[expectedIntentIndex, expectedIntentIndex];
                    intentsPair.Expected1Found2 = confusionMatrix[expectedIntentIndex, predictedIntentIndex];
                    intentsPair.Expected2Found1 = confusionMatrix[predictedIntentIndex, expectedIntentIndex];
                    intentsPair.Expected2Found2 = confusionMatrix[predictedIntentIndex, predictedIntentIndex];
                    intentsPair.ConfusionErrors = intentsPair.Expected1Found2 + intentsPair.Expected2Found1;

                    // Rate = confusions / correct predictions, forced to 1 when there are
                    // confusions but no correct prediction; otherwise left at its initial value
                    if (intentsPair.Expected1Found1 > 0)
                    {
                        intentsPair.ConfusionRate1To2 = (float)intentsPair.Expected1Found2 / intentsPair.Expected1Found1;
                    }
                    else if (intentsPair.Expected1Found2 > 0)
                    {
                        intentsPair.ConfusionRate1To2 = 1;
                    }
                    if (intentsPair.Expected2Found2 > 0)
                    {
                        intentsPair.ConfusionRate2To1 = (float)intentsPair.Expected2Found1 / intentsPair.Expected2Found2;
                    }
                    else if (intentsPair.Expected2Found1 > 0)
                    {
                        intentsPair.ConfusionRate2To1 = 1;
                    }
                    intentsConfusion.Add(intentsPair);
                }
            }

            // ---- 7. Write the report ----

            using (StreamWriter sw = new StreamWriter(@"..\fasttext\" + MODEL_NAME + ".results.csv", false, Encoding.GetEncoding("iso8859-1")))
            {
                sw.WriteLine("1. Intents performance");
                sw.WriteLine();
                sw.WriteLine("Intent;# Training;# Validation;F1;Precision;Recall");
                foreach (var intentPerf in intentsPerfs.OrderByDescending(perf => perf.F1))
                {
                    // An intent may be missing from one of the two sets : its count is then 0
                    int trainingCount;
                    intentsCountTraining.TryGetValue(intentPerf.Intent, out trainingCount);
                    int validationCount;
                    intentsCountValidation.TryGetValue(intentPerf.Intent, out validationCount);

                    sw.Write(intentPerf.Intent);
                    sw.Write(';');
                    sw.Write(trainingCount);
                    sw.Write(';');
                    sw.Write(validationCount);
                    sw.Write(';');
                    sw.Write(intentPerf.F1 < 0 ? "N/A" : intentPerf.F1.ToString("N2"));
                    sw.Write(';');
                    sw.Write(intentPerf.Precision < 0 ? "N/A" : intentPerf.Precision.ToString("N2"));
                    sw.Write(';');
                    sw.Write(intentPerf.Recall < 0 ? "N/A" : intentPerf.Recall.ToString("N2"));
                    sw.WriteLine();
                }
                sw.WriteLine();

                sw.WriteLine("2. Intents confusion (top 50)");
                sw.WriteLine();
                sw.WriteLine("Intent 1;Intent 2;# Confusions;# Confusion 1>2;% Confusion 1>2;# Confusion 2>1;% Confusion 2>1");
                foreach (var intentPair in intentsConfusion.OrderByDescending(pair => pair.ConfusionErrors).Take(50))
                {
                    sw.Write(intents[intentPair.IntentIndex1]);
                    sw.Write(';');
                    sw.Write(intents[intentPair.IntentIndex2]);
                    sw.Write(';');
                    sw.Write(intentPair.ConfusionErrors);
                    sw.Write(';');
                    sw.Write(intentPair.Expected1Found2);
                    sw.Write(';');
                    sw.Write(intentPair.ConfusionRate1To2.ToString("N2"));
                    sw.Write(';');
                    sw.Write(intentPair.Expected2Found1);
                    sw.Write(';');
                    sw.Write(intentPair.ConfusionRate2To1.ToString("N2"));
                    sw.WriteLine();
                }
                sw.WriteLine();

                sw.WriteLine();
                sw.WriteLine();

                sw.WriteLine("3. Detailed error analysis for each intent");
                sw.WriteLine();
                for (int expectedIntentIndex = 0; expectedIntentIndex < intents.Count; expectedIntentIndex++)
                {
                    var expectedIntent = intents[expectedIntentIndex];

                    sw.WriteLine(">>> " + expectedIntent);
                    sw.WriteLine();

                    sw.WriteLine("Training set questions");
                    IList<string> trainingQuestions;
                    if (trainingSamples.TryGetValue(expectedIntent, out trainingQuestions))
                    {
                        foreach (var trainingQuestion in trainingQuestions)
                        {
                            sw.WriteLine(trainingQuestion);
                        }
                    }
                    else
                    {
                        sw.WriteLine("-- NONE --");
                    }
                    sw.WriteLine();

                    // Columns : question ; probability of the prediction ; margin over the second prediction
                    sw.WriteLine("Questions correctly classified in this class");
                    foreach (var predictionResult in predictionResults.Where(pred => pred.ExpectedIntentIndex == expectedIntentIndex && pred.IntentIndex1 == expectedIntentIndex).OrderByDescending(pred => pred.IntentProba1))
                    {
                        sw.Write(predictionResult.Question.Replace(';', ' '));
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba1);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba1 - predictionResult.IntentProba2);
                        sw.WriteLine();
                    }
                    sw.WriteLine();

                    // Columns : question ; predicted intent ; its probability ; margin ; second intent ; its probability
                    sw.WriteLine("Questions from this class incorrectly classified in another class");
                    foreach (var predictionResult in predictionResults.Where(pred => pred.ExpectedIntentIndex == expectedIntentIndex && pred.IntentIndex1 != expectedIntentIndex).OrderBy(pred => pred.IntentIndex1).ThenByDescending(pred => pred.IntentProba1))
                    {
                        sw.Write(predictionResult.Question.Replace(';', ' '));
                        sw.Write(';');
                        sw.Write(intents[predictionResult.IntentIndex1]);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba1);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba1 - predictionResult.IntentProba2);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentIndex2 >= 0 ? intents[predictionResult.IntentIndex2] : "N/A");
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba2);
                        sw.WriteLine();
                    }
                    sw.WriteLine();

                    // Columns : question ; expected intent ; probability of the prediction ; margin ; second intent ; its probability
                    sw.WriteLine("Questions from another class incorrectly classified in this class");
                    foreach (var predictionResult in predictionResults.Where(pred => pred.ExpectedIntentIndex != expectedIntentIndex && pred.IntentIndex1 == expectedIntentIndex).OrderBy(pred => pred.ExpectedIntentIndex).ThenByDescending(pred => pred.IntentProba1))
                    {
                        sw.Write(predictionResult.Question.Replace(';', ' '));
                        sw.Write(';');
                        sw.Write(intents[predictionResult.ExpectedIntentIndex]);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba1);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba1 - predictionResult.IntentProba2);
                        sw.Write(';');
                        sw.Write(predictionResult.IntentIndex2 >= 0 ? intents[predictionResult.IntentIndex2] : "N/A");
                        sw.Write(';');
                        sw.Write(predictionResult.IntentProba2);
                        sw.WriteLine();
                    }
                    sw.WriteLine();
                }
            }
        }
        /// <summary>
        /// Reads a CSV file of labelled questions, shuffles the samples and writes them as
        /// training / validation files next to the CSV file, one pair of files per split.
        /// </summary>
        /// <remarks>
        /// On each input line the label is the text after the LAST comma and the question is
        /// the text before it. Each output line is FASTTEXT_LABEL_PREFIX + label + ' ' +
        /// preprocessed question.
        /// The shuffled samples are cut into buckets of (sample count / splitTrainingSets)
        /// samples. Set number k (starting at 1) writes bucket k to
        /// "[csv name][k].valid" and every other sample to "[csv name][k].train"; samples left
        /// over by the integer division are therefore always training samples.
        /// When splitTrainingSets is 1, all samples are written to "[csv name].train" and an
        /// empty "[csv name].delete" file is created.
        /// When the input file does not exist, an error is written to the console.
        /// </remarks>
        /// <param name="csvFilePath">UTF-8 input file, one "question,label" sample per line.</param>
        /// <param name="splitTrainingSets">Number of training / validation sets to generate; at least 1.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="splitTrainingSets"/> is lower than 1.</exception>
        /// <exception cref="FormatException">A line of the input file contains no comma.</exception>
        public static void GenerateFasttextTrainingFileFromCsvTable(string csvFilePath, int splitTrainingSets)
        {
            if (splitTrainingSets < 1)
            {
                // 0 would otherwise fail later with a DivideByZeroException
                throw new ArgumentOutOfRangeException("splitTrainingSets", splitTrainingSets, "at least one training set must be requested");
            }
            if (!File.Exists(csvFilePath))
            {
                Console.WriteLine("ERROR : File " + csvFilePath + " doesn't exist");
                return;
            }

            Console.WriteLine("Reading file : " + csvFilePath + " ...");
            var questions = new List<LabelAndQuestion>();
            using (StreamReader sr = new StreamReader(csvFilePath, Encoding.UTF8))
            {
                int lineNumber = 0;
                string line = null;
                while ((line = sr.ReadLine()) != null)
                {
                    lineNumber++;

                    int intentIndex = line.LastIndexOf(',');
                    if (intentIndex < 0)
                    {
                        throw new FormatException("invalid file format : no ',' separator on line " + lineNumber + " of " + csvFilePath);
                    }
                    var labelAndQuestion = new LabelAndQuestion();
                    labelAndQuestion.Question = line.Substring(0, intentIndex);
                    labelAndQuestion.Label    = line.Substring(intentIndex + 1);
                    questions.Add(labelAndQuestion);
                }
            }
            Shuffle(questions);

            int bucketQuestionsCount = questions.Count / splitTrainingSets;

            string csvFileName = Path.GetFileNameWithoutExtension(csvFilePath);
            // The directory is empty for a bare file name : Path.Combine then keeps the output
            // files relative, where "directory + separator + name" would point to the root
            string csvFileDirectory = Path.GetDirectoryName(csvFilePath) ?? string.Empty;

            for (int trainingSetNumber = 1; trainingSetNumber <= splitTrainingSets; trainingSetNumber++)
            {
                string trainingFilePath   = Path.Combine(csvFileDirectory, csvFileName + trainingSetNumber + ".train");
                string validationFilePath = Path.Combine(csvFileDirectory, csvFileName + trainingSetNumber + ".valid");
                if (splitTrainingSets == 1)
                {
                    // Single set : the only bucket contains every sample, so everything goes to
                    // the "validation" writer, which is pointed at the real training file
                    trainingFilePath   = Path.Combine(csvFileDirectory, csvFileName + ".delete");
                    validationFilePath = Path.Combine(csvFileDirectory, csvFileName + ".train");
                }

                // Index range [start, end) of the samples held out for validation in this set
                int validationStart = (trainingSetNumber - 1) * bucketQuestionsCount;
                int validationEnd   = trainingSetNumber * bucketQuestionsCount;

                // Counters are local to the set so that each message reports its own files
                int trainingSamplesCount   = 0;
                int validationSamplesCount = 0;

                using (StreamWriter trainsw = new StreamWriter(trainingFilePath, false, Encoding.UTF8))
                using (StreamWriter validsw = new StreamWriter(validationFilePath, false, Encoding.UTF8))
                {
                    for (int questionIndex = 0; questionIndex < questions.Count; questionIndex++)
                    {
                        var labelAndQuestion = questions[questionIndex];

                        StringBuilder sbQuestion = new StringBuilder();
                        sbQuestion.Append(FASTTEXT_LABEL_PREFIX);
                        sbQuestion.Append(labelAndQuestion.Label);
                        sbQuestion.Append(' ');
                        sbQuestion.Append(SentenceClassifier.PreprocessSentence(labelAndQuestion.Question));

                        bool writeToValidation = questionIndex >= validationStart && questionIndex < validationEnd;
                        if (!writeToValidation)
                        {
                            trainsw.WriteLine(sbQuestion.ToString());
                            trainingSamplesCount++;
                        }
                        else
                        {
                            validsw.WriteLine(sbQuestion.ToString());
                            validationSamplesCount++;
                        }
                    }
                }

                if (splitTrainingSets == 1)
                {
                    // The samples were routed to the ".train" file through the validation writer
                    Console.WriteLine("OK - " + validationSamplesCount + " training samples written to " + validationFilePath);
                }
                else
                {
                    Console.WriteLine("OK - " + trainingSamplesCount + " training samples written to " + trainingFilePath);
                    Console.WriteLine("OK - " + validationSamplesCount + " validation samples written to " + validationFilePath);
                }
            }
        }