public string ShallowParse(string input)
{
    var output = new StringBuilder();
    string[] sentences = _sentenceDetector.SentenceDetect(input);
    foreach (string sentence in sentences)
    {
        string[] tokens = _tokenizer.Tokenize(sentence);
        string[] tags = _posTagger.Tag(tokens);
        output.Append(string.Join(" ", _chunker.GetChunks(tokens, tags)));
        output.Append("\r\n\r\n");
    }
    return output.ToString();
}
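// A minimal sketch (not part of the original listing) of how the fields used by
// ShallowParse might be initialized. It assumes the standard SharpNLP/OpenNLP model
// files (EnglishSD.nbin, EnglishTok.nbin, EnglishPOS.nbin, EnglishChunk.nbin) sit in
// _modelPath; adjust the paths and constructors to the library version actually in use.
private void InitializeNlpPipeline()
{
    _sentenceDetector = new EnglishMaximumEntropySentenceDetector(_modelPath + "EnglishSD.nbin");
    _tokenizer = new EnglishMaximumEntropyTokenizer(_modelPath + "EnglishTok.nbin");
    _posTagger = new EnglishMaximumEntropyPosTagger(_modelPath + "EnglishPOS.nbin");
    _chunker = new EnglishTreebankChunker(_modelPath + "EnglishChunk.nbin");
}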
public string[] SplitSentences(string paragraph)
{
    if (mSentenceDetector == null)
    {
        mSentenceDetector = new EnglishMaximumEntropySentenceDetector(mModelPath + @"\EnglishSD.nbin");
    }
    return mSentenceDetector.SentenceDetect(paragraph);
}
// NLP methods -------------------------------------------

private string[] SplitSentences(string paragraph)
{
    if (_sentenceDetector == null)
    {
        _sentenceDetector = new EnglishMaximumEntropySentenceDetector(_modelPath + "EnglishSD.nbin");
    }
    return _sentenceDetector.SentenceDetect(paragraph);
}
private string Lexicalize(string input)
{
    var output = new StringBuilder();
    string[] sentences = _sentenceDetector.SentenceDetect(input);
    foreach (string sentence in sentences)
    {
        string[] tokens = _tokenizer.Tokenize(sentence);
        string[] tags = _posTagger.Tag(tokens);
        for (int currentTag = 0; currentTag < tags.Length; currentTag++)
        {
            // Ignore stop words entirely
            if (!IsStopWord.ContainsKey(tokens[currentTag]))
            {
                if (tags[currentTag].StartsWith("NN"))
                {
                    // Keep nouns; reduce plural nouns (NNS) to their singular lemma
                    if (tags[currentTag] == "NNS")
                    {
                        output.Append(_wn.Lemmatize(tokens[currentTag], "noun")).Append(" ");
                    }
                    else
                    {
                        output.Append(tokens[currentTag]).Append(" ");
                    }
                }
                else if (tags[currentTag].StartsWith("VB"))
                {
                    // Keep verbs as their base lemma, dropping non-3rd-person present forms (VBP)
                    if (tags[currentTag] != "VBP")
                    {
                        output.Append(_wn.Lemmatize(tokens[currentTag], "verb")).Append(" ");
                    }
                }
                else if (tags[currentTag].StartsWith("JJ"))
                {
                    // Keep adjectives as-is
                    output.Append(tokens[currentTag]).Append(" ");
                }
            }
        }
    }
    return output.ToString();
}
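// Lexicalize above relies on an IsStopWord lookup and a WordNet-backed lemmatizer (_wn)
// that are not shown in this listing. A minimal stand-in for the stop-word table could
// look like the following sketch; the word list is purely illustrative.
private static readonly Dictionary<string, bool> IsStopWord = new Dictionary<string, bool>
{
    { "the", true }, { "a", true }, { "an", true },
    { "of", true }, { "and", true }, { "or", true },
    { "to", true }, { "in", true }, { "on", true }
};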
public string[] SplitTextToSentences(string text)
{
    return _maximumEntropySentenceDetector.SentenceDetect(text);
}
private static void OptimizeSentenceDetectionTraining()
{
    // all directories in Input folder
    var inputFolderPath = CurrentDirectory + "Input/";
    var allDirectories = Directory.GetDirectories(inputFolderPath);
    Console.WriteLine("Pick the model to train:");
    for (var i = 0; i < allDirectories.Length; i++)
    {
        Console.WriteLine("{0} - {1}", i, Path.GetFileName(allDirectories[i]));
    }

    // read directory chosen by user
    int directoryIndexPicked = LoopUntilValidUserInput(
        input => int.Parse(input),
        i => i < allDirectories.Length,
        string.Format("Please enter a number in [0..{0}]", allDirectories.Length - 1));

    // read user parameters
    Console.WriteLine("Enter the iteration values to test, separated by a comma (ex: 10,100,200)");
    var iterations = LoopUntilValidUserInput(
        input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
        li => li != null && li.Any(),
        "At least one iteration value is required");
    Console.WriteLine("Enter the cut values to test, separated by a comma (ex: 1,2,5)");
    var cuts = LoopUntilValidUserInput(
        input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
        li => li != null && li.Any(),
        "At least one cut value is required");

    // train model file
    var directory = allDirectories[directoryIndexPicked];
    var allTrainFiles = Directory.GetFiles(directory, "*.train");
    Console.WriteLine("Training model with files {0}",
        string.Join(", ", allTrainFiles.Select(f => Path.GetFileNameWithoutExtension(f))));

    // load training data
    var allSentences = new List<string>();
    foreach (var file in allTrainFiles.Where(f => !f.Contains("wsj")))
    {
        allSentences.AddRange(File.ReadAllLines(file));
    }
    var testData = string.Join(" ", allSentences);

    var bestIterationValue = iterations.First();
    var bestCutValue = cuts.First();
    var bestAccuracy = 0.0d;
    var endOfSentenceScanner = new CharactersSpecificEndOfSentenceScanner('.', '?', '!', '"', '-', '…');
    foreach (var iteration in iterations)
    {
        foreach (var cut in cuts)
        {
            var model = MaximumEntropySentenceDetector.TrainModel(allTrainFiles, iteration, cut, endOfSentenceScanner);

            // compute accuracy as a Dice-style overlap between detected and reference sentences;
            // not perfect for comparing files but it gives a very good approximation
            //var commonValues = allSentences.Intersect(results).Count();
            var sentenceDetector = new MaximumEntropySentenceDetector(model, endOfSentenceScanner);
            var results = sentenceDetector.SentenceDetect(testData);
            var nbOfCommonSentences = 0;
            foreach (var result in results)
            {
                if (allSentences.Contains(result))
                {
                    nbOfCommonSentences++;
                }
                else
                {
                    //Console.WriteLine(result);
                }
            }
            var accuracyScore = (float)2 * nbOfCommonSentences / (allSentences.Count + results.Count());
            Console.WriteLine("Accuracy for iteration={0} and cut={1}: {2}", iteration, cut, accuracyScore);
            if (accuracyScore > bestAccuracy)
            {
                bestAccuracy = accuracyScore;
                bestIterationValue = iteration;
                bestCutValue = cut;
            }
        }
    }

    // persist model
    var outputFilePath = CurrentDirectory + "Output/" + Path.GetFileName(directory) + ".nbin";
    Console.WriteLine("Persisting model for iteration={0} and cut={1} to file '{2}'...",
        bestIterationValue, bestCutValue, outputFilePath);
    var bestModel = MaximumEntropySentenceDetector.TrainModel(allTrainFiles, bestIterationValue, bestCutValue, endOfSentenceScanner);
    new BinaryGisModelWriter().Persist(bestModel, outputFilePath);
    Console.WriteLine("Output file written.");
}
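// The LoopUntilValidUserInput helper used above is not part of this excerpt. A plausible
// implementation (an assumption, not the original code) keeps prompting on the console
// until the parsed value passes the supplied validation predicate:
private static T LoopUntilValidUserInput<T>(Func<string, T> parser, Func<T, bool> isValid, string errorMessage)
{
    while (true)
    {
        var input = Console.ReadLine();
        try
        {
            var value = parser(input);
            if (isValid(value))
            {
                return value;
            }
        }
        catch (Exception)
        {
            // parsing failed; fall through to the error message and prompt again
        }
        Console.WriteLine(errorMessage);
    }
}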