Exemple #1
0
        /// <summary>
        /// Runs shallow parsing (chunking) over the input text and returns the
        /// chunker output, one group per detected sentence, separated by blank lines.
        /// </summary>
        /// <param name="input">Raw text to parse.</param>
        /// <returns>Space-joined chunks for each sentence, sentences separated by "\r\n\r\n".</returns>
        public string ShallowParse(string input)
        {
            var result = new StringBuilder();

            foreach (var sentence in _sentenceDetector.SentenceDetect(input))
            {
                var tokens = _tokenizer.Tokenize(sentence);
                var tags = _posTagger.Tag(tokens);
                var chunks = _chunker.GetChunks(tokens, tags);

                result.Append(string.Join(" ", chunks)).Append("\r\n\r\n");
            }

            return result.ToString();
        }
Exemple #2
0
        /// <summary>
        /// Splits a paragraph into individual sentences, lazily constructing the
        /// English sentence detector from the on-disk model on first use.
        /// </summary>
        /// <param name="paragraph">Text to split into sentences.</param>
        /// <returns>The detected sentences, in order.</returns>
        public string[] SplitSentences(string paragraph)
        {
            // Build the detector only once; reuse it for all later calls.
            if (mSentenceDetector == null)
            {
                var modelFile = mModelPath + @"\EnglishSD.nbin";
                mSentenceDetector = new EnglishMaximumEntropySentenceDetector(modelFile);
            }

            return mSentenceDetector.SentenceDetect(paragraph);
        }
Exemple #3
0
        // NLP methods -------------------------------------------

        private string[] SplitSentences(string paragraph)
        {
            if (_sentenceDetector == null)
            {
                _sentenceDetector = new EnglishMaximumEntropySentenceDetector(_modelPath + "EnglishSD.nbin");
            }

            return(_sentenceDetector.SentenceDetect(paragraph));
        }
Exemple #4
0
        /// <summary>
        /// Reduces the input text to a space-separated sequence of content words:
        /// keeps nouns, verbs and adjectives (skipping words present in the
        /// IsStopWord table), lemmatizing plural nouns (NNS) and all verb forms
        /// except VBP via WordNet. VBP tokens are dropped entirely.
        /// </summary>
        /// <param name="input">Raw text to lexicalize.</param>
        /// <returns>The filtered/lemmatized tokens joined by single spaces (with a trailing space per kept token).</returns>
        private string Lexicalize(string input)
        {
            var result = new StringBuilder();

            foreach (var sentence in _sentenceDetector.SentenceDetect(input))
            {
                var tokens = _tokenizer.Tokenize(sentence);
                var tags = _posTagger.Tag(tokens);

                for (var i = 0; i < tags.Length; i++)
                {
                    var token = tokens[i];
                    var tag = tags[i];

                    // Stop words are excluded regardless of part of speech.
                    if (IsStopWord.ContainsKey(token))
                    {
                        continue;
                    }

                    if (tag.StartsWith("NN"))
                    {
                        // Only plural nouns (NNS) are reduced to their singular lemma.
                        var word = tag == "NNS" ? _wn.Lemmatize(token, "noun") : token;
                        result.Append(word).Append(" ");
                    }
                    else if (tag.StartsWith("VB"))
                    {
                        // VBP (non-3rd-person singular present) is intentionally dropped.
                        if (tag != "VBP")
                        {
                            result.Append(_wn.Lemmatize(token, "verb")).Append(" ");
                        }
                    }
                    else if (tag.StartsWith("JJ"))
                    {
                        // Adjectives are kept verbatim.
                        result.Append(token).Append(" ");
                    }
                }
            }

            return result.ToString();
        }
Exemple #5
0
 /// <summary>
 /// Splits raw text into sentences using the maximum-entropy sentence detector.
 /// </summary>
 /// <param name="text">Text to split.</param>
 /// <returns>The detected sentences, in order.</returns>
 public string[] SplitTextToSentences(string text)
     => _maximumEntropySentenceDetector.SentenceDetect(text);
Exemple #6
0
        /// <summary>
        /// Interactively trains sentence-detection models over a user-chosen
        /// sub-directory of the Input folder, grid-searching all combinations of
        /// user-supplied iteration and cut values, then retrains and persists the
        /// best-scoring model to the Output folder as a .nbin file.
        /// Accuracy is an F1-style approximation: 2*common / (expected + detected).
        /// </summary>
        private static void OptimizeSentenceDetectionTraining()
        {
            // all directories in Input folder
            var inputFolderPath = CurrentDirectory + "Input/";
            var allDirectories  = Directory.GetDirectories(inputFolderPath);

            Console.WriteLine("Pick the model to train:");
            for (var i = 0; i < allDirectories.Length; i++)
            {
                Console.WriteLine("{0} - {1}", i, Path.GetFileName(allDirectories[i]));
            }

            // read directory chosen by user
            // FIX: also reject negative indices; the original predicate (i < Length)
            // accepted e.g. -1 and would later throw IndexOutOfRangeException.
            int directoryIndexPicked = LoopUntilValidUserInput(input => int.Parse(input),
                                                               i => i >= 0 && i < allDirectories.Length, string.Format("Please enter a number in [0..{0}]", allDirectories.Length - 1));

            // read user parameters
            Console.WriteLine("Enter the iteration values to test, separated by a comma (ex: 10,100,200)");
            var iterations = LoopUntilValidUserInput(input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
                                                     li => li != null && li.Any(),
                                                     "At least one iteration value is required");

            Console.WriteLine("Enter the cut values to test, separated by a comma (ex: 1,2,5)");
            var cuts = LoopUntilValidUserInput(input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
                                               li => li != null && li.Any(),
                                               "At least one cut value is required");

            // train model file
            var directory     = allDirectories[directoryIndexPicked];
            var allTrainFiles = Directory.GetFiles(directory, "*.train");

            Console.WriteLine("Training model with files {0}", string.Join(", ", allTrainFiles.Select(f => Path.GetFileNameWithoutExtension(f))));

            // load training data (wsj files are excluded from the accuracy corpus)
            var allSentences = new List <string>();

            foreach (var file in allTrainFiles.Where(f => !f.Contains("wsj")))
            {
                allSentences.AddRange(File.ReadAllLines(file));
            }
            var testData = string.Join(" ", allSentences);

            var bestIterationValue = iterations.First();
            // FIX: was initialized from iterations.First(); if no combination ever
            // beats bestAccuracy, the persisted cut value would be an iteration count.
            var bestCutValue       = cuts.First();
            var bestAccuracy       = 0.0d;

            var endOfSentenceScanner = new CharactersSpecificEndOfSentenceScanner('.', '?', '!', '"', '-', '…');

            foreach (var iteration in iterations)
            {
                foreach (var cut in cuts)
                {
                    var model = MaximumEntropySentenceDetector.TrainModel(allTrainFiles, iteration, cut, endOfSentenceScanner);
                    // compute accuracy
                    var sentenceDetector = new MaximumEntropySentenceDetector(model, endOfSentenceScanner);
                    var results          = sentenceDetector.SentenceDetect(testData);

                    // not perfect for comparing files but it gives a very good approximation
                    //var commonValues = allSentences.Intersect(results).Count();
                    var nbOfCommonSentences = 0;
                    foreach (var result in results)
                    {
                        if (allSentences.Contains(result))
                        {
                            nbOfCommonSentences++;
                        }
                        else
                        {
                            //Console.WriteLine(result);
                        }
                    }
                    var accuracyScore = (float)2 * nbOfCommonSentences / (allSentences.Count + results.Count());
                    Console.WriteLine("Accuracy for iteration={0} and cut={1}: {2}", iteration, cut, accuracyScore);
                    if (accuracyScore > bestAccuracy)
                    {
                        bestAccuracy       = accuracyScore;
                        bestIterationValue = iteration;
                        bestCutValue       = cut;
                    }
                }
            }

            // Persist model
            var outputFilePath = CurrentDirectory + "Output/" + Path.GetFileName(directory) + ".nbin";

            Console.WriteLine("Persisting model for iteration={0} and cut={1} to file '{2}'...", bestIterationValue, bestCutValue, outputFilePath);
            var bestModel = MaximumEntropySentenceDetector.TrainModel(allTrainFiles, bestIterationValue, bestCutValue, endOfSentenceScanner);

            new BinaryGisModelWriter().Persist(bestModel, outputFilePath);
            Console.WriteLine("Output file written.");
        }