Ejemplo n.º 1
0
        /// <summary>
        /// Feeds every string yielded by the corpus reader through the paragraph
        /// splitter and prints each resulting paragraph followed by a separator line.
        /// When anything throws for a given input, the exception and then the input
        /// text are printed and the run moves on to the next input.
        /// </summary>
        public void StressTest_Paragraphs()
        {
            var corpus   = new CorpusFileReader(false);
            var splitter = new ParagraphSplitter(Dialect.LooseyGoosey);

            foreach (string fileText in corpus.NextFile())
            {
                try
                {
                    // Parse the whole text, then dump paragraph by paragraph.
                    foreach (Paragraph para in splitter.ParseProse(fileText).Paragraphs)
                    {
                        Console.WriteLine(para);
                        Console.WriteLine("   ---");
                    }
                }
                catch (Exception failure)
                {
                    // Best effort: report the problem together with the offending text.
                    Console.WriteLine(failure);
                    Console.WriteLine(fileText);
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses a one-sentence text and checks that the parser hands back a
        /// non-null prose object, which is then printed.
        /// </summary>
        public void TestOneSentenceIsNotNull()
        {
            const string oneSentence = "ni li pona kin.";

            var   splitter = new ParagraphSplitter(Dialect.LooseyGoosey);
            Prose parsed   = splitter.ParseProse(oneSentence);

            Assert.NotNull(parsed);
            Console.WriteLine(parsed.ToString());
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses a text made of two blocks of three sentences separated by a blank
        /// line and checks that exactly two paragraphs are produced.
        /// </summary>
        public void TestTwoParagraphsManySentenceEach()
        {
            // Verbatim literal: the blank line in the middle is part of the input.
            const string twoBlocks = @"ni li pona kin. ale li pona. moku li pona.

ni li pona kin. soweli li pona. waso li tawa kon.";

            var   splitter = new ParagraphSplitter(Dialect.LooseyGoosey);
            Prose parsed   = splitter.ParseProse(twoBlocks);

            Assert.NotNull(parsed);
            Assert.AreEqual(2, parsed.Paragraphs.Length);
            Console.WriteLine(parsed.ToString());
        }
        /// <summary>
        /// Parses every text yielded by the corpus reader into paragraphs and
        /// sentences, renders each sentence back to a string and loosely compares it
        /// (via <c>TpLettersEqual</c>) with the original text stored in the sentence's
        /// diagnostics. Each mismatch is printed as an "O:" (original) / "G:"
        /// (generated) pair, and a summary is printed at the end with the number of
        /// sentences examined, the number of mismatches, and the number of sentences
        /// whose comparison threw.
        /// </summary>
        public void StressTest_ParsePARAGRAPHS_SpitBack_LooselyCompare()
        {
            CorpusFileReader reader  = new CorpusFileReader(true);
            Dialect          dialect = Dialect.LooseyGoosey;

            ParagraphSplitter paragraphSplitter = new ParagraphSplitter(dialect);

            // Ask the splitter not to throw on errors so one bad sentence does not
            // end the whole run.
            paragraphSplitter.ThrowOnErrors = false;

            int total      = 0; // sentences examined
            int mismatched = 0; // sentences whose round trip differs from the original
            int failed     = 0; // sentences whose round trip threw

            foreach (string s in reader.NextFile())
            {
                // Files whose name contains "ipoC" are left out of this test.
                if (reader.currentFile.Contains("ipoC"))
                {
                    continue;
                }

                Prose prose = paragraphSplitter.ParseProse(s);
                foreach (Paragraph paragraph in prose.Paragraphs)
                {
                    foreach (Sentence sentence in paragraph)
                    {
                        // Count every sentence, whether it matches, mismatches or fails;
                        // previously this counter was never updated and always printed 0.
                        total++;

                        try
                        {
                            string repeatBack = sentence.ToString();
                            string original   = sentence.Diagnostics.Original;
                            if (!repeatBack.TpLettersEqual(original))
                            {
                                Console.WriteLine("O: " + original.Trim(new[] { ' ', '\t', '\n', '\r' }).Replace("\n", " "));
                                Console.WriteLine("G: " + repeatBack);
                                Console.WriteLine(" --- ");
                                mismatched++;
                            }
                        }
                        catch (Exception)
                        {
                            // Deliberately tolerant: tally the failure and keep going.
                            failed++;
                        }
                    }
                }
            }

            Console.WriteLine("Total: " + total);
            Console.WriteLine("Mismatched: " + mismatched);
            Console.WriteLine("Failed Sentences: " + failed);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds the gloss of a whole prose by appending the result of
        /// <c>GlossOneSentence</c> for every sentence, in order, and ending each
        /// paragraph with a line break.
        /// </summary>
        /// <param name="prose">Prose whose paragraphs are glossed.</param>
        /// <param name="dialect">Passed through to <c>GlossOneSentence</c>.</param>
        /// <param name="includePos">Passed through to <c>GlossOneSentence</c>.</param>
        /// <returns>The concatenated gloss text.</returns>
        public string GlossProse(Prose prose, Dialect dialect, bool includePos = false)
        {
            var output = new StringBuilder();

            foreach (Paragraph paragraph in prose.Paragraphs)
            {
                foreach (Sentence current in paragraph)
                {
                    var gloss = GlossOneSentence(includePos, current, dialect);
                    output.Append(gloss);
                }

                // One line break closes each paragraph.
                output.AppendLine();
            }

            return output.ToString();
        }
        /// <summary>
        /// Splits raw text into paragraphs, splits each paragraph into sentences,
        /// normalizes and parses every sentence, and returns the result as a
        /// <see cref="Prose"/>.
        /// </summary>
        /// <param name="text">
        /// Raw text to parse. Leading and trailing carriage returns / line feeds are
        /// stripped before splitting.
        /// </param>
        /// <param name="policy">Not read by this method.</param>
        /// <returns>
        /// A prose holding one paragraph per paragraph string found, together with
        /// the title and speaker picked up from the first two sentences (either may
        /// be null) and the current local time rendered as a string.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        /// <remarks>
        /// When <c>ThrowOnErrors</c> is false, a sentence that fails to parse is
        /// replaced by a placeholder sentence carrying the exception message instead
        /// of aborting the whole parse.
        /// </remarks>
        public Prose ParseProse(string text, string policy = "guess")
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            text = text.Trim(new[] { '\r', '\n' });
            List<Paragraph> paras = new List<Paragraph>();

            // A blank line separates paragraphs. Longer separators come first because
            // String.Split tries the separators in array order at each position, so
            // "\r\n\r\n" must win over "\n\n" / "\r\r". Previously only four
            // consecutive "\n" (or "\r") were recognized, so a plain blank line in
            // LF-only or CR-only text was never treated as a paragraph break.
            string[] doubleSpace = { "\r\n\r\n", "\n\r\n\r", "\n\n", "\r\r" };
            string[] paraStrings = text.Split(doubleSpace, StringSplitOptions.RemoveEmptyEntries);

            if (paraStrings.Length == 1)
            {
                //We only got 1 paragraph. Was this because the document is short?
                //Or because we actually are seeing this:
                //
                //  blah blah. blah blah. (para break)
                //  blah blah. blah blah. (para break)
                //
                // "\r\n" is listed first so CRLF line ends do not leave a stray '\r'
                // on each line (the list used to be "\n" twice).
                string[] singleSpace = { "\r\n", "\n", "\r" };
                string[] alternative = text.Split(singleSpace, StringSplitOptions.RemoveEmptyEntries);

                //If a new line is a para break, then every line should have at least one
                //sentence terminator
                if (alternative.All(x => x.Contains(".") || x.Contains("?") || x.Contains("!")))
                {
                    paraStrings = alternative;
                }
            }

            Speaker speaker = null;
            string  title   = null;

            // 1-based position of the current sentence within the whole text. It is
            // advanced after every sentence; previously it stayed at 1, which made
            // the speaker branch below unreachable and let every fragment sentence
            // overwrite the title.
            int sentenceNumber = 1;

            SentenceSplitter ss   = new SentenceSplitter(dialect);
            Normalizer       norm = new Normalizer(dialect);

            foreach (string paraString in paraStrings)
            {
                string[] sentenceStrings = ss.ParseIntoNonNormalizedSentences(paraString);

                Paragraph para = new Paragraph();
                foreach (string sentenceString in sentenceStrings)
                {
                    string normalized = norm.NormalizeText(sentenceString);

                    Sentence sentence;
                    if (ThrowOnErrors)
                    {
                        sentence = pu.ParsedSentenceFactory(normalized, sentenceString);
                    }
                    else
                    {
                        try
                        {
                            sentence = pu.ParsedSentenceFactory(normalized, sentenceString);
                        }
                        catch (Exception ex)
                        {
                            // Keep going: record the failure as a placeholder sentence
                            // that still carries the original and normalized text.
                            sentence = new Sentence(new NullOrSymbols(ex.Message), new SentenceDiagnostics(sentenceString, normalized));
                        }
                    }

                    // A fragment as the very first sentence is taken as the title.
                    if (sentenceNumber == 1 && sentence.Fragment != null)
                    {
                        title = sentence.ToString();
                    }

                    // A fragment as the second sentence that mentions jan, meli or mije
                    // is taken as the speaker.
                    // NOTE(review): this also replaces the title, as originally
                    // written - confirm that is intended.
                    if (sentenceNumber == 2 && sentence.Fragment != null)
                    {
                        if (sentence.Fragment.Contains(Words.jan) ||
                            sentence.Fragment.Contains(Words.meli) ||
                            sentence.Fragment.Contains(Words.mije))
                        {
                            title   = sentence.ToString();
                            speaker = new Speaker(sentence.ToString());
                        }
                    }

                    para.Add(sentence);
                    sentenceNumber++;
                }
                paras.Add(para);
            }

            Prose p = new Prose(paras.ToArray(), title, speaker, DateTime.Now.ToString());

            return p;
        }