/// <summary>
/// Splits the given content into individual sentences.
/// </summary>
/// <param name="contents">The text to split into sentences.</param>
/// <returns>The detected sentences, in document order.</returns>
public static IEnumerable<string> ExtractSentences(string contents)
{
    var detector = new SentenceDetectorME(Model);
    return detector.sentDetect(contents);
}
/// <summary>
/// Splits every *.txt file under the working folder into sentences and
/// appends them, one per line, to output\Html.txt.
/// </summary>
static void Main(string[] args)
{
    string[] files = Directory.GetFiles(@"..\..\..\", "*.txt");

    // Load the sentence-detection model once. The original re-read
    // en-sent.bin from disk for every single input line.
    java.io.InputStream modelIn = new java.io.FileInputStream(string.Format(@"{0}\en-sent.bin", @"..\Debug"));
    SentenceModel model = new SentenceModel(modelIn);
    SentenceDetector detector = new SentenceDetectorME(model);
    modelIn.close();

    // 'using' guarantees the writer is flushed and closed even on exceptions
    // (the original only closed it on the success path).
    using (StreamWriter sw = new StreamWriter(@"..\..\..\output\Html.txt"))
    {
        foreach (string path in files)
        {
            using (StreamReader sr = new StreamReader(path))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    foreach (string sent in detector.sentDetect(line))
                    {
                        sw.WriteLine(sent);
                    }
                }
            }
            sw.Flush();
        }
    }
}
/// <summary>
/// Breaks a block of text into its component sentences.
/// </summary>
/// <param name="contents">Text to segment.</param>
/// <returns>A sequence of detected sentences.</returns>
public static IEnumerable<string> ExtractSentences(string contents)
{
    return new SentenceDetectorME(Model).sentDetect(contents);
}
/// <summary>
/// Trains an English sentence model, evaluates it with the SharpNL detector,
/// round-trips the model through serialization, and evaluates the same model
/// with the Java OpenNLP detector.
/// </summary>
public void TestEverything()
{
    using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt"))
    {
        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var sdFactory = new SentenceDetectorFactory("en", true, null, null);
        var stream = new SentenceSampleStream(new PlainTextByLineStream(file));

        var model = SentenceDetectorME.Train("en", stream, sdFactory, mlParams);

        Assert.AreEqual("en", model.Language);
        Assert.AreEqual(true, model.UseTokenEnd);

        var sMe = new SentenceDetectorME(model);

        // test the SharpNL sentences
        SentenceDetectorMETest.EvalSentences(sMe);

        var sFile = Path.GetTempFileName();
        try
        {
            // Dispose the FileStream explicitly — the original leaked it,
            // which can keep the temp file locked when the Java side opens it.
            using (var fs = new FileStream(sFile, FileMode.Create))
            {
                model.Serialize(fs);
            }

            var jModel2 = new JavaModel(OpenNLP.CreateInputStream(sFile));
            var jMe = new JavaSDME(jModel2);

            // test the Java OpenNLP sentences.
            JavaEvalSentences(jMe);

            // first try?! Yes! ;-)
        }
        finally
        {
            // GetTempFileName creates the file on disk; clean it up.
            File.Delete(sFile);
        }
    }
}
/// <summary>
/// Reverses the token order of every sentence in the message and rebuilds
/// a readable string from the reversed tokens.
/// </summary>
/// <param name="Message">The text whose sentences should be reversed.</param>
/// <returns>A StringBuilder holding the detokenized, reversed sentences.</returns>
private StringBuilder ReverseIt(string Message)
{
    StringBuilder reversedString = new StringBuilder();

    // Use the preloaded models to find sentence boundaries and tokens.
    SentenceDetectorME sentenceParser = new SentenceDetectorME(LoadNLP.sentenceModel);
    TokenizerME tokenizer = new TokenizerME(LoadNLP.tokenModel);
    string[] sentences = sentenceParser.sentDetect(Message);

    foreach (string sentence in sentences)
    {
        string[] tokens = tokenizer.tokenize(sentence);

        // Reverse the token order (replaces the original hand-rolled swap loop).
        Array.Reverse(tokens);

        // Detokenize back into a usable string and accumulate.
        reversedString.Append(DeTokenize(tokens, DetokenizationDictionary.Operation.MOVE_LEFT));
    }

    return reversedString;
}
/// <summary>
/// Trains a sentence-detector model from the tool's sample stream and writes
/// it to the output file given on the command line.
/// </summary>
public override void run(string format, string[] args)
{
    base.run(format, args);

    // Try to load training parameters from the params file, if one was given.
    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
    if (mlParams != null)
    {
        // Sentence detection is event-based; reject sequence-training configs.
        if (TrainUtil.isSequenceTraining(mlParams.Settings))
        {
            throw new TerminateToolException(1, "Sequence training is not supported!");
        }
    }

    // No params file — fall back to the iteration/cutoff CLI options.
    if (mlParams == null)
    {
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    }

    // Validate the output path before doing any expensive training.
    Jfile modelOutFile = @params.Model;
    CmdLineUtil.checkOutputFile("sentence detector model", modelOutFile);

    // Optional custom end-of-sentence characters.
    char[] eos = null;
    if (@params.EosChars != null)
    {
        eos = @params.EosChars.ToCharArray();
    }

    SentenceModel model;
    try
    {
        Dictionary dict = loadDict(@params.AbbDict);
        SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(@params.Factory, @params.Lang, true, dict, eos);
        model = SentenceDetectorME.train(@params.Lang, sampleStream, sdFactory, mlParams);
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    }
    finally
    {
        // Always close the sample stream, even if training failed.
        try
        {
            sampleStream.close();
        }
        catch (IOException)
        {
            // sorry that this can fail
        }
    }

    CmdLineUtil.writeModel("sentence detector", modelOutFile, model);
}
/// <summary>
/// Lazily loads the sentence-detection model on first use; subsequent calls
/// are no-ops once <c>alreadyLoadSentenceDetector</c> is set.
/// </summary>
private void LoadSentenceDetector()
{
    if (!alreadyLoadSentenceDetector)
    {
        java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
        try
        {
            SentenceModel sentenceModel = new SentenceModel(modelInpStream);
            sentenceDetector = new SentenceDetectorME(sentenceModel);
            alreadyLoadSentenceDetector = true;
        }
        finally
        {
            // The original leaked this stream; close it once the model is built.
            modelInpStream.close();
        }
    }
}
/// <summary>
/// Detects sentence boundary positions in the given paragraph.
/// </summary>
/// <param name="paragraph">Text to analyze.</param>
/// <returns>Character spans of the detected sentences.</returns>
public Span[] SentPosDetect(string paragraph)
{
    var bin = GetFileStream("en-sent.bin");
    try
    {
        SentenceModel model = new SentenceModel(bin);
        SentenceDetectorME sdetector = new SentenceDetectorME(model);
        return sdetector.sentPosDetect(paragraph);
    }
    finally
    {
        // Close in finally — the original leaked the stream on any exception.
        bin.close();
    }
}
/// <summary>
/// Trains a sentence-detection model from a plain-text training file.
/// </summary>
/// <param name="path">Path to the training data (one sentence per line).</param>
/// <returns>The trained <see cref="SentenceModel"/>.</returns>
public static SentenceModel TrainModel(string path)
{
    // 'using' disposes the file handle; the original never closed it.
    using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
    {
        TrainingParameters trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "100");
        trainParams.Set(Parameters.Cutoff, "0");

        SentenceDetectorFactory detectorFactory = new SentenceDetectorFactory(TRAINING_LANGUAGE, true, null, null);
        SentenceSampleStream sampleStream = new SentenceSampleStream(new PlainTextByLineStream(fs));

        return SentenceDetectorME.Train(TRAINING_LANGUAGE, sampleStream, detectorFactory, trainParams);
    }
}
/// <summary>
/// Splits the text file at <paramref name="src"/> into sentences and writes
/// them, one per line, to a sibling "*_sent.txt" file.
/// </summary>
static void Sent(string src)
{
    // Load the model once — the original re-read en-sent.bin for every line.
    InputStream modelIn = new FileInputStream(@"..\..\..\en-sent.bin");
    SentenceModel smodel = new SentenceModel(modelIn);
    SentenceDetector detector = new SentenceDetectorME(smodel);
    modelIn.close();

    // 'using' closes both streams and flushes the writer; the original never
    // closed either, so buffered output could be lost.
    using (StreamReader input = new StreamReader(src))
    using (StreamWriter output = new StreamWriter(Regex.Replace(src, "(.*).txt", "$1_sent.txt")))
    {
        string line;
        while ((line = input.ReadLine()) != null)
        {
            foreach (string sent in detector.sentDetect(line))
            {
                output.WriteLine(sent);
            }
        }
    }
}
/// <summary>
/// Trains a Dutch sentence detector with an abbreviation dictionary and
/// verifies that dots in "12. Toedracht" and "tel. 011-..." are not treated
/// as sentence terminators.
/// </summary>
public void AbbreviationDefaultBehaviorTest()
{
    // Training sample: nine lines (one is blank); several contain mid-sentence
    // dots that must NOT split the sentence.
    var samples = "Test E-mail met zowel letsel als 12. Toedracht in het onderwerp." + Environment.NewLine +
                  "Dit is een 2e regel met een tel. 011-4441444 erin." + Environment.NewLine +
                  "Dit is een 2e regel." + Environment.NewLine +
                  "Dit is een 2e regel." + Environment.NewLine +
                  Environment.NewLine +
                  "Dit is een 2e regel met een tel. 033-1333123 erin!" + Environment.NewLine +
                  "Test E-mail met zowel winst als 12. toedracht in het onderwerp." + Environment.NewLine +
                  "Dit is een 2e regel!" + Environment.NewLine +
                  "Dit is een 2e regel." + Environment.NewLine;

    // Phrases whose internal/trailing dot should be ignored by the detector.
    var stringsToIgnoreDictionary = new SharpNL.Dictionary.Dictionary(false)
    {
        { "12. Toedracht" },
        { "Tel." },
    };

    var trainingParameters = new TrainingParameters();
    trainingParameters.Set(Parameters.Algorithm, "MAXENT");
    trainingParameters.Set(Parameters.TrainerType, "Event");
    trainingParameters.Set(Parameters.Iterations, "100");
    trainingParameters.Set(Parameters.Cutoff, "5");

    // Explicit end-of-sentence characters for the factory.
    char[] eos = { '.', '?', '!' };
    var sdFactory = new SentenceDetectorFactory("nl", true, stringsToIgnoreDictionary, eos);
    var stringReader = new StringReader(samples);
    var stream = new SentenceSampleStream(new PlainTextByLineStream(stringReader));

    var sentenceModel = SentenceDetectorME.Train("nl", stream, sdFactory, trainingParameters);
    var sentenceDetectorMe = new SentenceDetectorME(sentenceModel);

    // Detect sentences in the same text the model was trained on.
    var sentences = sentenceDetectorMe.SentDetect(samples);

    // Each non-empty input line should come back as exactly one sentence.
    var expected = samples.Split(new [] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
    Assert.AreEqual(8, sentences.Length);
    for (var i = 0; i < sentences.Length; i++)
    {
        Assert.AreEqual(expected[i], sentences[i]);
    }
}
/// <summary>
/// Trains an English sentence detector from the bundled sample file and
/// runs the shared evaluation assertions against it.
/// </summary>
public void TestSentenceDetector()
{
    using (var file = Tests.OpenFile("/opennlp/tools/sentdetect/Sentences.txt"))
    {
        var trainingParams = new TrainingParameters();
        trainingParams.Set(Parameters.Iterations, "100");
        trainingParams.Set(Parameters.Cutoff, "0");

        var factory = new SentenceDetectorFactory("en", true, null, null);
        var samples = new SentenceSampleStream(new PlainTextByLineStream(file));

        var model = SentenceDetectorME.Train("en", samples, factory, trainingParams);

        Assert.AreEqual("en", model.Language);
        Assert.AreEqual(model.UseTokenEnd, true);

        EvalSentences(new SentenceDetectorME(model));
    }
}
/// <summary>
/// Reads the USPE source text, normalizes whitespace, splits it into
/// sentences, and writes one sentence per line to the output file.
/// </summary>
public void SplitSentences()
{
    var txt = File.ReadAllText(@"c:\dev\d-mill\uspe\Data\uspe-1.txt");

    // Collapse every whitespace run (including CR/LF) to a single space.
    txt = Regex.Replace(txt, "\\s+", " ");
    // NOTE: the original also did Regex.Replace(txt, "\\r\\n", "") here, but
    // that was dead code — the \s+ pass above already removed all CR/LF.
    // Escape the dot: the original pattern "MR.\s+" let '.' match any char.
    txt = Regex.Replace(txt, "MR\\.\\s+", "MR.");

    var modelStream = new java.io.FileInputStream("../../Models/en-sent.bin");
    try
    {
        var model = new SentenceModel(modelStream);
        var detector = new SentenceDetectorME(model);
        var sentences = detector.sentDetect(txt);
        File.WriteAllLines(@"c:\dev\d-mill\uspe\Data\uspe-sentenced.txt", sentences);
    }
    finally
    {
        // The original never closed the model stream.
        modelStream.close();
    }
}
/// <summary>
/// Splits every file in the Dataset folder into sentences and writes the
/// result to a sibling file with an "rtf" extension.
/// </summary>
public static void Main(string[] args)
{
    // Load the sentence model once — the original re-read en-sent.bin from
    // disk for every file in the dataset — and close the stream afterwards.
    java.io.InputStream modelIn = new java.io.FileInputStream(@"..\..\..\..\..\en-sent.bin");
    SentenceModel smodel = new SentenceModel(modelIn);
    SentenceDetector detector = new SentenceDetectorME(smodel);
    modelIn.close();

    DirectoryInfo folder = new DirectoryInfo(@"..\..\..\..\..\Dataset");
    foreach (var fname in folder.GetFiles())
    {
        string line = File.ReadAllText(fname.FullName);
        string[] sents = detector.sentDetect(line);

        // Replace the last three characters of the name (the extension) with "rtf",
        // matching the original's substring-based swap.
        string outPath = fname.FullName.Replace(fname.FullName.Substring(fname.FullName.Length - 3), "rtf");
        using (StreamWriter sw = new StreamWriter(outPath))
        {
            foreach (var sent in sents)
            {
                sw.WriteLine(sent);
            }
        }
    }
}
/// <summary> /// Perform sentence detection the input stream. /// /// A newline will be treated as a paragraph boundary. /// </summary>
public override void run(string[] args)
{
    // Exactly one argument (the model file) is required.
    if (args.Length != 1)
    {
        Console.WriteLine(Help);
    }
    else
    {
        // Load the serialized model from the path given on the command line.
        SentenceModel model = (new SentenceModelLoader()).load(new File(args[0]));

        SentenceDetectorME sdetector = new SentenceDetectorME(model);

        // Read stdin paragraph by paragraph (blank line = paragraph boundary).
        ObjectStream <string> paraStream = new ParagraphStream(new PlainTextByLineStream(new InputStreamReader(Console.OpenStandardInput)));

        // Throughput counter reported on stderr.
        PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
        perfMon.start();

        try
        {
            string para;
            while ((para = paraStream.read()) != null)
            {
                // One detected sentence per output line; blank line after
                // each paragraph.
                string[] sents = sdetector.sentDetect(para);
                foreach (string sentence in sents)
                {
                    Console.WriteLine(sentence);
                }
                perfMon.incrementCounter(sents.Length);
                Console.WriteLine();
            }
        }
        catch (IOException e)
        {
            CmdLineUtil.handleStdinIoError(e);
        }

        perfMon.stopAndPrintFinalResult();
    }
}
/// <summary>
/// Loads every OpenNLP model (sentence, tokenizer, POS, chunker, parser)
/// and the stop-word list. Each model stream is closed after the model is
/// built — the original leaked all five FileInputStreams and the reader.
/// </summary>
public NLP()
{
    //loading sentence detector model
    java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
    SentenceModel sentenceModel = new SentenceModel(modelInpStream);
    sentenceDetector = new SentenceDetectorME(sentenceModel);
    modelInpStream.close();

    //loading tokenizer model
    modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
    TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);
    tokenizer = new TokenizerME(tokenizerModel);
    modelInpStream.close();

    // POS tagger model.
    modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
    POSModel posModel = new POSModel(modelInpStream);
    tagger = new POSTaggerME(posModel);
    modelInpStream.close();

    // Chunker model.
    modelInpStream = new java.io.FileInputStream("Resources\\en-chunker.bin");
    ChunkerModel chunkerModel = new ChunkerModel(modelInpStream);
    chunker = new ChunkerME(chunkerModel);
    modelInpStream.close();

    // Parser model.
    modelInpStream = new java.io.FileInputStream("Resources\\en-parser-chunking.bin");
    ParserModel parserModel = new ParserModel(modelInpStream);
    parser = ParserFactory.create(parserModel);
    modelInpStream.close();

    //loading stop words list — keep both the stemmed and the raw form.
    using (StreamReader sr = new StreamReader("Resources\\english.stop.txt"))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            stopwords.Add(Stemming(line));
            stopwords.Add(line);
        }
    }
}
/// <summary>
/// Splits each line of data.txt into sentences and writes them to
/// result.txt, one per line with a blank line after each sentence.
/// </summary>
static void Main(string[] args)
{
    // Load the sentence model once — the original re-read en-sent.bin
    // for every input line — and close the model stream.
    java.io.InputStream modelIn = new java.io.FileInputStream("en-sent.bin");
    SentenceModel smodel = new SentenceModel(modelIn);
    SentenceDetector detector = new SentenceDetectorME(smodel);
    modelIn.close();

    // 'using' disposes both streams even if detection throws.
    using (StreamReader sr = new StreamReader(@"..\..\Data\data.txt"))
    using (StreamWriter sw = new StreamWriter(@"..\..\Data\result.txt"))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            foreach (var sent in detector.sentDetect(line))
            {
                sw.WriteLine(sent);
                sw.WriteLine(); // blank separator line, as before
            }
            sw.Flush();
        }
    }
}
/// <summary>
/// For every file in the lab's input folder: detects sentences per line,
/// tokenizes the joined sentences, and writes the space-separated tokens
/// to a same-named file in the answer folder.
/// </summary>
static void Main(string[] args)
{
    // Load both models once, up front (no pointless string.Format around
    // constant file names), and close the streams after the models exist.
    java.io.InputStream sentModelIn = new java.io.FileInputStream("en-sent.bin");
    java.io.InputStream tokenModelIn = new java.io.FileInputStream("en-token.bin");
    TokenizerModel tokenModel = new TokenizerModel(tokenModelIn);
    TokenizerME tokenizer = new TokenizerME(tokenModel);
    SentenceModel sentModel = new SentenceModel(sentModelIn);
    SentenceDetector detector = new SentenceDetectorME(sentModel);
    tokenModelIn.close();
    sentModelIn.close();

    string folderName = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\file";
    foreach (string fname in System.IO.Directory.GetFiles(folderName))
    {
        // Path.GetFileName replaces the fragile fname.Split('\\')[6] lookup,
        // which only worked for one specific directory depth.
        string outPath = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\answer\" + Path.GetFileName(fname);

        using (StreamReader file2 = new StreamReader(fname))
        using (StreamWriter sw = new StreamWriter(outPath))
        {
            string line;
            while ((line = file2.ReadLine()) != null)
            {
                string[] sents = detector.sentDetect(line);
                if (sents.Length == 0)
                {
                    continue;
                }

                // Join the detected sentences, then tokenize the joined text
                // (equivalent to the original null-seeded concatenation loop).
                string str = string.Concat(sents);
                foreach (var token in tokenizer.tokenize(str))
                {
                    sw.Write(token + " ");
                }
                sw.WriteLine();
            }
        }
    }
}
/// <summary>
/// Trains an English sentence model from the default sample stream using
/// the supplied factory and default training parameters.
/// </summary>
private static SentenceModel Train(SentenceDetectorFactory factory)
{
    var samples = CreateSampleStream();
    var parameters = TrainingParameters.DefaultParameters();
    return SentenceDetectorME.Train("en", samples, factory, parameters);
}
// Constructors and finalizers:

/// <summary>
/// Loads every NLP resource the repository uses: path configuration, the
/// WordNet engine, the OpenNLP models (sentence, tokenizer, name finder,
/// POS tagger, chunker, optional parser), the Stanford parser grammar,
/// and a Porter stemmer.
/// </summary>
private Repository()
{
    // Assembly simple name = text before the first comma of the full name.
    _assemblyName = Regex.Match(_assemblyFullName, "^(.*?),.*$").Result("$1");

    // Build resource paths, normalizing separators via Dsc.
    _rootDrive = ("/usr/project/xtmp/dp195/Poetix18/").Replace(@"\", Dsc);
    _nlpFolder = ("rhetorica/nlp/").Replace(@"\", Dsc);

    _openNlpModelsFolder = ("OpenNLP/models/").Replace(@"\", Dsc);
    _openNlpModelsPath = RootDrive + _nlpFolder + _openNlpModelsFolder;

    _wordNetFolder = ("WordNet_3/").Replace(@"\", Dsc);
    _wordNetPath = RootDrive + _nlpFolder + _wordNetFolder;

    _grammarFolder = ("StanfordParser/grammar/").Replace(@"\", Dsc);
    _grammarPath = RootDrive + _nlpFolder + _grammarFolder;

    _dataFolder = ("data/").Replace(@"\", Dsc);
    _nlpTextsPath = RootDrive + _dataFolder;

    string[] localTextDirectoryParts = {
        CurrentAssemblyDirectoryPath,
        "..", "..","..", "data"
        //"..", "..", "text"
    };
    _localTextPath = Path.Combine(localTextDirectoryParts) + "/"; // For development use

    // WordNet engine:
    Console.Write("Loading WordNet engine.... ");
    _wordNetEngine = new WordNetEngine(WordNetPath, true);
    Console.WriteLine("Done.");

    // OpenNLP sentence detector:
    Console.Write("Loading OpenNLP sentence detector.... ");
    java.io.FileInputStream modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-sent.bin");
    _sentenceModel = new SentenceModel(modelInputStream);
    modelInputStream.close();
    _sentenceDetector = new SentenceDetectorME(_sentenceModel);
    Console.WriteLine("Done.");

    // OpenNLP tokenizer:
    Console.Write("Loading OpenNLP tokenizer.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-token.bin");
    _tokenizerModel = new opennlp.tools.tokenize.TokenizerModel(modelInputStream);
    modelInputStream.close();
    _tokenizer = new opennlp.tools.tokenize.TokenizerME(_tokenizerModel);
    Console.WriteLine("Done.");

    // OpenNLP name finder:
    // NOTE(review): this literal contains a raw line break in the source —
    // preserved byte-for-byte here.
    Console.Write("Loading OpenNLP name finder.... 
");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-ner-person.bin");
    _tokenNameFinderModel = new TokenNameFinderModel(modelInputStream);
    modelInputStream.close();
    _nameFinder = new NameFinderME(_tokenNameFinderModel);
    Console.WriteLine("Done.");

    // OpenNLP POS tagger:
    Console.Write("Loading OpenNLP POS tagger.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-pos-maxent.bin");
    _posModel = new POSModel(modelInputStream);
    modelInputStream.close();
    _tagger = new POSTaggerME(_posModel);
    Console.WriteLine("Done.");

    // OpenNLP chunker:
    Console.Write("Loading OpenNLP chunker.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-chunker.bin");
    _chunkerModel = new ChunkerModel(modelInputStream);
    modelInputStream.close();
    _chunker = new ChunkerME(_chunkerModel);
    Console.WriteLine("Done.");

    // OpenNLP parser (optional — only loaded when _loadParser is set):
    if (_loadParser)
    {
        Console.Write("Loading OpenNLP parser.... ");
        modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-parser-chunking.bin");
        _parserModel = new ParserModel(modelInputStream);
        modelInputStream.close();
        _parser = ParserFactory.create(_parserModel);
        Console.WriteLine("Done.");
    }

    // Stanford parser:
    //_stanfordParser = new LexicalizedParser(GrammarPath + "englishPCFG.ser.gz"); // Obsolete method
    _stanfordParser = LexicalizedParser.loadModel(GrammarPath + "englishPCFG.ser.gz");

    // Porter stemmer:
    _porterStemmer = new PorterStemmer();
}
/// <summary>
/// Creates a sentence detector backed by the supplied, already-loaded model.
/// </summary>
/// <param name="model">The sentence model to wrap.</param>
public SentenceDetector(SentenceModel model)
{
    detector = new SentenceDetectorME(model);
}
/// <summary>
/// Creates a sentence detector by training a fresh model from the default
/// training data under the current working directory.
/// </summary>
public SentenceDetector()
{
    var modelPath = Environment.CurrentDirectory + TRAINING_MODEL_PATH;
    this.detector = new SentenceDetectorME(TrainModel(modelPath));
}
/// <summary>
/// Creates a sentence detector from a serialized model read off the
/// given stream.
/// </summary>
/// <param name="modelStream">Stream positioned at a serialized model.</param>
public SentenceDetector(FileStream modelStream)
{
    this.detector = new SentenceDetectorME(new SentenceModel(modelStream));
}
/// <summary>
/// Creates the operation without a sentence splitter; one must be supplied
/// later (e.g. via the model-taking constructor) before detection can run.
/// </summary>
public NLPSentenceDetectorOp()
{
    this.sentenceSplitter = null;
}
/// <summary>
/// Creates the operation with a splitter built from the given model.
/// </summary>
/// <param name="model">The sentence model to use for splitting.</param>
public NLPSentenceDetectorOp(SentenceModel model)
{
    this.sentenceSplitter = new SentenceDetectorME(model);
}
/// <summary>
/// Shows the user the sentences of the current text file, numbered, and
/// returns the sentence the user picks (empty string on any failure).
/// </summary>
/// <returns>The chosen sentence, or "" if an error occurred.</returns>
public String chooseSentenceMenu()
{
    int userInputNumber = 0;
    string userChoosenSentence = "";
    System.Console.WriteLine("Choose a sentence to use from your current text");
    System.Console.WriteLine("Must be a space between each sentence");
    try
    {
        // NOTE: the original also built an unused java.io.File for
        // "C:\en-sent.bin"; that dead local has been removed.
        java.io.InputStream modelIn = new FileInputStream("C:\\Users\\jcoleman\\Documents\\Capstone\\jcoleman_Capstone\\Code\\NEWCHATBOT\\ConsoleBot\\ConsoleBot\\en-sent.bin");
        try
        {
            SentenceModel model = new SentenceModel(modelIn);
            SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);

            // Concatenate the file's lines (StringBuilder avoids the O(n^2)
            // string += loop of the original).
            FileText = System.IO.File.ReadAllLines(FilePath);
            var textBuilder = new System.Text.StringBuilder();
            for (int i = 0; i < FileText.Length; i++)
            {
                textBuilder.Append(FileText[i]);
            }

            string[] sentences = sentenceDetector.sentDetect(textBuilder.ToString());
            for (int s = 0; s < sentences.Length; s++)
            {
                System.Console.WriteLine((s + 1) + " : " + sentences[s]);
            }

            // 1-based menu choice from the user.
            string userInput = System.Console.ReadLine();
            userInputNumber = int.Parse(userInput);
            userChoosenSentence = sentences[userInputNumber - 1];
        }
        finally
        {
            // Close in finally — the original leaked the stream on exceptions.
            modelIn.close();
        }
    }
    catch (Exception e)
    {
        System.Console.WriteLine(e.Message);
    }
    return userChoosenSentence;
}
/// <summary>
/// Shared assertion battery for a trained English sentence detector:
/// checks splitting, sentence probabilities, and span positions across
/// a variety of edge-case inputs.
/// </summary>
internal static void EvalSentences(SentenceDetectorME sentDetect)
{
    // Two plain sentences.
    const string sampleSentences1 = "This is a test. There are many tests, this is the second.";
    var sents = sentDetect.SentDetect(sampleSentences1);
    Assert.AreEqual(sents.Length, 2);
    Assert.AreEqual(sents[0], "This is a test.");
    Assert.AreEqual(sents[1], "There are many tests, this is the second.");
    var probs = sentDetect.GetSentenceProbabilities();
    Assert.AreEqual(probs.Length, 2);

    // Second sentence lacks a terminating period.
    const string sampleSentences2 = "This is a test. There are many tests, this is the second";
    sents = sentDetect.SentDetect(sampleSentences2);
    Assert.AreEqual(sents.Length, 2);
    probs = sentDetect.GetSentenceProbabilities();
    Assert.AreEqual(probs.Length, 2);
    Assert.AreEqual(sents[0], "This is a test.");
    Assert.AreEqual(sents[1], "There are many tests, this is the second");

    // Quoted text inside and across the boundary.
    const string sampleSentences3 = "This is a \"test\". He said \"There are many tests, this is the second.\"";
    sents = sentDetect.SentDetect(sampleSentences3);
    Assert.AreEqual(sents.Length, 2);
    probs = sentDetect.GetSentenceProbabilities();
    Assert.AreEqual(probs.Length, 2);
    Assert.AreEqual(sents[0], "This is a \"test\".");
    Assert.AreEqual(sents[1], "He said \"There are many tests, this is the second.\"");

    // Three sentences, one ending inside a quote, one a question.
    const string sampleSentences4 = "This is a \"test\". I said \"This is a test.\" Any questions?";
    sents = sentDetect.SentDetect(sampleSentences4);
    Assert.AreEqual(sents.Length, 3);
    probs = sentDetect.GetSentenceProbabilities();
    Assert.AreEqual(probs.Length, 3);
    Assert.AreEqual(sents[0], "This is a \"test\".");
    Assert.AreEqual(sents[1], "I said \"This is a test.\"");
    Assert.AreEqual(sents[2], "Any questions?");

    // Trailing space should be trimmed from the single sentence.
    const string sampleSentences5 = "This is a one sentence test space at the end. ";
    sents = sentDetect.SentDetect(sampleSentences5);
    Assert.AreEqual(1, sentDetect.GetSentenceProbabilities().Length);
    Assert.AreEqual(sents[0], "This is a one sentence test space at the end.");

    // Trailing tab/whitespace should also be trimmed.
    // NOTE(review): this literal contains raw trailing whitespace and a line
    // break in the source — preserved byte-for-byte.
    const string sampleSentences6 = "This is a one sentences test with tab at the end. 
";
    sents = sentDetect.SentDetect(sampleSentences6);
    Assert.AreEqual(sents[0], "This is a one sentences test with tab at the end.");

    // Multiple spaces between sentences.
    const string sampleSentences7 = "This is a test. With spaces between the two sentences.";
    sents = sentDetect.SentDetect(sampleSentences7);
    Assert.AreEqual(sents[0], "This is a test.");
    Assert.AreEqual(sents[1], "With spaces between the two sentences.");

    // Empty input yields no sentences.
    const string sampleSentences9 = "";
    sents = sentDetect.SentDetect(sampleSentences9);
    Assert.AreEqual(0, sents.Length);

    // Whitespace-only input yields no sentences.
    const string sampleSentences10 = " "; // whitespaces and tabs
    sents = sentDetect.SentDetect(sampleSentences10);
    Assert.AreEqual(0, sents.Length);

    // No terminator plus trailing spaces — still one trimmed sentence.
    const string sampleSentences11 = "This is test sentence without a dot at the end and spaces ";
    sents = sentDetect.SentDetect(sampleSentences11);
    Assert.AreEqual(sents[0], "This is test sentence without a dot at the end and spaces");
    probs = sentDetect.GetSentenceProbabilities();
    Assert.AreEqual(1, probs.Length);

    // Leading whitespace is trimmed.
    const string sampleSentence12 = " This is a test.";
    sents = sentDetect.SentDetect(sampleSentence12);
    Assert.AreEqual(sents[0], "This is a test.");

    // Leading whitespace, no terminator.
    const string sampleSentence13 = " This is a test";
    sents = sentDetect.SentDetect(sampleSentence13);
    Assert.AreEqual(sents[0], "This is a test");

    // Test that sentPosDetect also works
    var pos = sentDetect.SentPosDetect(sampleSentences2);
    Assert.AreEqual(pos.Length, 2);
    probs = sentDetect.GetSentenceProbabilities();
    Assert.AreEqual(probs.Length, 2);
    Assert.AreEqual(new Span(0, 15), pos[0]);
    Assert.AreEqual(new Span(16, 56), pos[1]);
}
/// <summary>
/// Splits the given text into sentences using the embedded en_sent model
/// resource.
/// </summary>
/// <param name="Text">The text to segment.</param>
/// <returns>The detected sentences.</returns>
public static string[] SplitSentences(string Text)
{
    var resourceStream = new java.io.ByteArrayInputStream(Resource.en_sent);
    var sentenceModel = new SentenceModel(resourceStream);
    var sentenceDetector = new SentenceDetectorME(sentenceModel);
    return sentenceDetector.sentDetect(Text);
}