/// <summary>
/// Smoke test: feeds every text yielded by the corpus reader through
/// <c>ParagraphSplitter.ParseProse</c> and prints each resulting paragraph
/// followed by a separator line.
/// </summary>
/// <remarks>
/// Makes no assertions. When parsing or printing a text throws, the exception
/// and the offending text are written to the console and the run continues
/// with the next text.
/// </remarks>
public void StressTest_Paragraphs()
{
    CorpusFileReader corpus = new CorpusFileReader(false);
    ParagraphSplitter splitter = new ParagraphSplitter(Dialect.LooseyGoosey);

    foreach (string fileText in corpus.NextFile())
    {
        try
        {
            Prose parsed = splitter.ParseProse(fileText);

            foreach (Paragraph para in parsed.Paragraphs)
            {
                Console.WriteLine(para);
                Console.WriteLine(" ---");
            }
        }
        catch (Exception failure)
        {
            // Best effort: report the failure together with its input, then move on.
            Console.WriteLine(failure);
            Console.WriteLine(fileText);
        }
    }
}
/// <summary>
/// Parses a single one-sentence text and checks that a non-null
/// <c>Prose</c> comes back; the result is also printed to the console.
/// </summary>
public void TestOneSentenceIsNotNull()
{
    const string oneSentence = "ni li pona kin.";
    ParagraphSplitter splitter = new ParagraphSplitter(Dialect.LooseyGoosey);

    Prose parsed = splitter.ParseProse(oneSentence);

    Assert.NotNull(parsed);
    Console.WriteLine(parsed.ToString());
}
/// <summary>
/// Parses a text made of two paragraphs with three sentences each and
/// checks that the resulting <c>Prose</c> is non-null and holds exactly two
/// paragraphs; the result is also printed to the console.
/// </summary>
public void TestTwoParagraphsManySentenceEach()
{
    // The paragraph break is spelled out with escape sequences instead of relying on
    // line breaks embedded in a verbatim literal: those depend on how the source file
    // is formatted and on the line endings of the checkout, and are invisible in review.
    const string firstParagraph = "ni li pona kin. ale li pona. moku li pona.";
    const string secondParagraph = "ni li pona kin. soweli li pona. waso li tawa kon.";
    const string paragraphBreak = "\r\n\r\n";

    ParagraphSplitter ps = new ParagraphSplitter(Dialect.LooseyGoosey);

    Prose prose = ps.ParseProse(firstParagraph + paragraphBreak + secondParagraph);

    Assert.NotNull(prose);
    Assert.AreEqual(2, prose.Paragraphs.Length);
    Console.WriteLine(prose.ToString());
}
/// <summary>
/// Round-trip stress test: parses every corpus text into prose, renders each
/// sentence back to a string, and reports the sentences whose rendering does
/// not match the original text according to <c>TpLettersEqual</c>.
/// </summary>
/// <remarks>
/// Makes no assertions. Texts are skipped while the reader's current file name
/// contains "ipoC". The splitter runs with <c>ThrowOnErrors</c> disabled.
/// A summary of total, mismatched and failed sentence counts is printed at the end.
/// </remarks>
public void StressTest_ParsePARAGRAPHS_SpitBack_LooselyCompare()
{
    CorpusFileReader reader = new CorpusFileReader(true);
    Dialect dialect = Dialect.LooseyGoosey;

    ParagraphSplitter paragraphSplitter = new ParagraphSplitter(dialect);
    paragraphSplitter.ThrowOnErrors = false;

    int total = 0;       // every sentence visited
    int mismatched = 0;  // sentences whose rendering differs from the original
    int failed = 0;      // sentences that threw while being rendered or compared

    foreach (string s in reader.NextFile())
    {
        if (reader.currentFile.Contains("ipoC"))
        {
            continue;
        }

        Prose prose = paragraphSplitter.ParseProse(s);

        foreach (Paragraph paragraph in prose.Paragraphs)
        {
            foreach (Sentence sentence in paragraph)
            {
                // Count before the try so that failed sentences are part of the total too.
                total++;

                try
                {
                    string repeatBack = sentence.ToString();
                    string original = sentence.Diagnostics.Original;

                    if (!repeatBack.TpLettersEqual(original))
                    {
                        Console.WriteLine("O: " + original.Trim(new[] { ' ', '\t', '\n', '\r' }).Replace("\n", " "));
                        Console.WriteLine("G: " + repeatBack);
                        Console.WriteLine(" --- ");
                        mismatched++;
                    }
                }
                catch (Exception)
                {
                    // Deliberately best effort: a failing sentence is tallied, not fatal.
                    failed++;
                }
            }
        }
    }

    Console.WriteLine("Total: " + total);
    Console.WriteLine("Mismatched: " + mismatched);
    Console.WriteLine("Failed Sentences: " + failed);
}
/// <summary>
/// Builds one string for a whole prose by appending the result of
/// <c>GlossOneSentence</c> for every sentence, with a line terminator
/// appended after each paragraph.
/// </summary>
/// <param name="prose">Prose whose paragraphs and sentences are glossed in order.</param>
/// <param name="dialect">Passed through unchanged to <c>GlossOneSentence</c>.</param>
/// <param name="includePos">Passed through unchanged to <c>GlossOneSentence</c>.</param>
/// <returns>The concatenated glosses, one line per paragraph.</returns>
public string GlossProse(Prose prose, Dialect dialect, bool includePos = false)
{
    StringBuilder output = new StringBuilder();

    foreach (Paragraph paragraph in prose.Paragraphs)
    {
        foreach (Sentence current in paragraph)
        {
            output.Append(GlossOneSentence(includePos, current, dialect));
        }

        // Terminate the paragraph's line, even when the paragraph had no sentences.
        output.AppendLine();
    }

    string glossed = output.ToString();
    return glossed;
}
/// <summary>
/// Splits raw text into paragraphs, splits each paragraph into sentences,
/// normalizes and parses every sentence, and wraps the result in a
/// <c>Prose</c> together with a title and speaker picked from the first
/// two sentences.
/// </summary>
/// <param name="text">
/// Raw text to parse. Leading and trailing carriage returns and line feeds
/// are trimmed before splitting.
/// </param>
/// <param name="policy">
/// Not read by this method; kept so existing callers that pass it still compile.
/// </param>
/// <returns>
/// A <c>Prose</c> holding one <c>Paragraph</c> per detected paragraph. The title and
/// speaker are null when the first two sentences do not qualify. The last constructor
/// argument is the current local time formatted with <c>DateTime.Now.ToString()</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
/// <remarks>
/// When <c>ThrowOnErrors</c> is set, an exception from the sentence parser propagates.
/// Otherwise the failing sentence is replaced by a placeholder sentence that carries
/// the exception message and the original/normalized text.
/// </remarks>
public Prose ParseProse(string text, string policy = "guess")
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    text = text.Trim(new[] { '\r', '\n' });

    string[] doubleSpace = { "\r\n\r\n", "\n\r\n\r", "\n\n\n\n", "\r\r\r\r" };
    string[] paraStrings = text.Split(doubleSpace, StringSplitOptions.RemoveEmptyEntries);

    if (paraStrings.Length == 1)
    {
        // Only one paragraph came back. Either the document really is short, or every
        // single line break is meant as a paragraph break:
        //
        //   blah blah. blah blah. (para break)
        //   blah blah. blah blah. (para break)
        //
        // "\r\n" is listed before "\n" so that Windows line endings are consumed whole
        // and no stray '\r' is left at the end of a paragraph.
        string[] singleSpace = { "\r\n", "\n" };
        string[] alternative = text.Split(singleSpace, StringSplitOptions.RemoveEmptyEntries);

        // If a new line is a paragraph break, every line should hold at least one
        // sentence terminator.
        if (alternative.All(x => x.Contains(".") || x.Contains("?") || x.Contains("!")))
        {
            paraStrings = alternative;
        }
    }

    List<Paragraph> paras = new List<Paragraph>();
    Speaker speaker = null;
    string title = null;

    // 1-based position of the sentence within the whole document (not per paragraph).
    // NOTE(review): previously this counter was never advanced, so the second-sentence
    // branch below never ran and every sentence with a Fragment overwrote the title.
    // Counting sentences is presumed to be the intent - confirm.
    int sentenceNumber = 1;

    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);

    foreach (string paraString in paraStrings)
    {
        string[] sentenceStrings = ss.ParseIntoNonNormalizedSentences(paraString);
        Paragraph para = new Paragraph();

        foreach (string sentenceString in sentenceStrings)
        {
            string normalized = norm.NormalizeText(sentenceString);
            Sentence sentence;

            if (ThrowOnErrors)
            {
                sentence = pu.ParsedSentenceFactory(normalized, sentenceString);
            }
            else
            {
                try
                {
                    sentence = pu.ParsedSentenceFactory(normalized, sentenceString);
                }
                catch (Exception ex)
                {
                    // Keep going: stand in a placeholder that records what went wrong.
                    sentence = new Sentence(new NullOrSymbols(ex.Message), new SentenceDiagnostics(sentenceString, normalized));
                }
            }

            // First sentence with a Fragment becomes the title.
            if (sentenceNumber == 1 && sentence.Fragment != null)
            {
                title = sentence.ToString();
            }

            // Second sentence with a Fragment containing jan/meli/mije becomes the speaker.
            // NOTE(review): this branch also replaces the title, as originally written -
            // confirm that overwriting the first-sentence title is wanted.
            if (sentenceNumber == 2 && sentence.Fragment != null)
            {
                if (sentence.Fragment.Contains(Words.jan) ||
                    sentence.Fragment.Contains(Words.meli) ||
                    sentence.Fragment.Contains(Words.mije))
                {
                    title = sentence.ToString();
                    speaker = new Speaker(sentence.ToString());
                }
            }

            para.Add(sentence);
            sentenceNumber++;
        }

        paras.Add(para);
    }

    Prose p = new Prose(paras.ToArray(), title, speaker, DateTime.Now.ToString());
    return p;
}