public void SpellCheck_ForStressTest_UsingOnlyWordConstructor()
        {
            CorpusFileReader            reader = new CorpusFileReader();
            Dictionary <string, int>    bad    = new Dictionary <string, int>();
            List <string>    good    = new List <string>();
            TokenParserUtils tpu     = new TokenParserUtils();
            Dialect          dialect = Dialect.LooseyGoosey;

            SentenceSplitter ss = new SentenceSplitter(dialect);

            Normalizer norm = new Normalizer(dialect);

            foreach (string s in reader.NextFile())
            {
                string[] rawSentences = ss.ParseIntoNonNormalizedSentences(s);
                foreach (string sentence in rawSentences)
                {
                    string normalized = norm.NormalizeText(sentence);
                    //Normalization improved stuff
                    string[] words = tpu.JustTokens(normalized);
                    for (int index = 0; index < words.Length; index++)
                    {
                        //Don't remove double quotes or we can't ID some marked foreign text.
                        //'"'
                        words[index] = words[index].Trim(new[] { ':', '.', '\'', '«', '»', '!', '?', '-', '[', ']' });
                    }
                    foreach (string word in words.Where(x => !string.IsNullOrEmpty(x)))
                    {
                        if (good.Contains(word))
                        {
                            continue;
                        }

                        try
                        {
                            Word w = new Word(word);
                            good.Add(word); //cache words that construct cleanly so we don't re-check them
                        }
                        catch (Exception)
                        {
                            if (bad.ContainsKey(word))
                            {
                                bad[word]++;
                            }
                            else
                            {
                                bad.Add(word, 1);
                            }
                        }
                    }
                }
            }
            foreach (KeyValuePair <string, int> pair in bad)
            {
                if (pair.Value > 10)
                {
                    Console.WriteLine("Uh-oh: " + pair.Key + " " + pair.Value);
                }
            }
        }
        public void WhyDidNormalizerStripOffTheDoubleQuotes()
        {
            //,CorpusTexts.JanSin  //Too many neologisms to cope with.
            string[] samples =
            {
                CorpusTexts.GeorgeSong
            };
            Dialect dialect = Dialect.LooseyGoosey;

            dialect.TargetGloss = "en";

            //ParserUtils pu = new ParserUtils(dialect);
            SentenceSplitter ss = new SentenceSplitter(dialect);

            Normalizer norm = new Normalizer(dialect);

            foreach (string sample in samples)
            {
                string[] sentences = ss.ParseIntoNonNormalizedSentences(sample);
                foreach (string sentence in sentences)
                {
                    if (sentence.ContainsCheck("Georgia"))
                    {
                        string result = norm.NormalizeText(sentence);
                        Assert.IsTrue(result.ContainsCheck("\"Georgia\""));
                    }
                }
            }
        }
        public void Normalization_Explicit_IsIdempotent()
        {
            //Implicit normalization is not expected to be idempotent.

            int               i       = 0;
            Dialect           dialect = Dialect.LooseyGoosey;
            NormalizeExplicit norm    = new NormalizeExplicit(dialect);
            SentenceSplitter  ss      = new SentenceSplitter(dialect);

            CorpusFileReader reader = new CorpusFileReader(true);

            foreach (string s in reader.NextFile())
            {
                if (reader.currentFile.ContainsCheck("janKipo"))
                {
                    continue;
                }

                string[] sentenceStrings = ss.ParseIntoNonNormalizedSentences(s);
                foreach (string sentence in sentenceStrings)
                {
                    string result1 = norm.NormalizeText(sentence);
                    string result2 = norm.NormalizeText(result1);
                    //Assert.AreEqual(result1,result2);
                    if (result1 != result2)
                    {
                        Console.WriteLine("1: " + (result1 ?? "NULL"));
                        Console.WriteLine("2: " + (result2 ?? "NULL"));
                    }
                    i++;
                }
            }
            Console.WriteLine("Sentences normalized: " + i);
        }
        public void IdentifyDiscourses_ParseKnownGoodTexts_ShowGoodOnes()
        {
            //,CorpusTexts.JanSin  //Too many neologisms to cope with.
            string[] samples =
            {
                //CorpusTexts.UnpaText,
                //CorpusTexts.Gilgamesh,
                CorpusTexts.ProfesorAndMadMan,
                CorpusTexts.SampleText1,
                CorpusTexts.SampleText3,
                CorpusTexts.Lao,
                CorpusTexts.GeorgeSong,
                CorpusTexts.CrazyAnimal,
                CorpusTexts.CrazyAnimal2
                , CorpusTexts.RuneDanceSong
                , CorpusTexts.janPusaRice
                , CorpusTexts.janPend
            };
            Dialect dialect = Dialect.LooseyGoosey;

            dialect.TargetGloss = "en";

            GlossMaker  gm = new GlossMaker();
            ParserUtils pu = new ParserUtils(dialect);

            int fail = 0;


            SentenceSplitter ss = new SentenceSplitter(dialect);

            Normalizer norm = new Normalizer(dialect);

            foreach (string sample in samples)
            {
                string[] sentenceStrings = ss.ParseIntoNonNormalizedSentences(sample);
                string[] normalized      = new string[sentenceStrings.Length];
                for (int index = 0; index < sentenceStrings.Length; index++)
                {
                    //try
                    //{
                    normalized[index] = norm.NormalizeText(sentenceStrings[index]);
                    Sentence sentence = pu.ParsedSentenceFactory(normalized[index], sentenceStrings[index]);

                    Console.WriteLine(sentence.ToString("g"));
                    //}
                    //catch (Exception ex)
                    //{
                    //    fail++;
                    //    Console.WriteLine(sentenceStrings[index]);
                    //    Console.WriteLine(ex);
                    //}
                }

                Console.WriteLine(fail + " failed sentences.");
            }
        }
        public void StressTestNormalize_AnuSeme()
        {
            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;

            CorpusFileReader reader = new CorpusFileReader(true);
            GlossMaker       gm     = new GlossMaker();
            SentenceSplitter ss     = new SentenceSplitter(dialect);

            Normalizer norm = new Normalizer(dialect);

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    //try
                    //{
                    string normalized = norm.NormalizeText(original);

                    if (!(normalized.ContainsWholeWord("anu seme")))
                    {
                        continue;
                    }
                    i++;
                    Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                    string   diag       = structured.ToString("b");

                    Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                    Console.WriteLine("B: " + diag);
                    Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
                    //}
                    //catch (Exception ex)
                    //{
                    //    if (ex.Message.ContainsCheck("all tests"))
                    //    {
                    //        Console.WriteLine("ORIGINAL  : " + original);
                    //        if (structured != null)
                    //        {
                    //            Console.WriteLine(structured.ToString("b"));
                    //        }
                    //        Console.WriteLine(ex.Message);
                    //        i++;
                    //    }
                    //    else throw;
                    //}
                }
            }
            Console.WriteLine("Failed Sentences: " + i);
        }
Example #6
        private static void ProcessSerializationsModel(SerializationsModel parse)
        {
            Dialect          dialect = Dialect.LooseyGoosey;
            ParserUtils      pu      = new ParserUtils(dialect);
            Normalizer       norm    = new Normalizer(dialect);
            SentenceSplitter ss      = new SentenceSplitter(dialect);

            string[] sentences = ss.ParseIntoNonNormalizedSentences(parse.SourceText);

            StringBuilder errors = new StringBuilder();

            List <Sentence> parseSentences = new List <Sentence>();
            int             i = 0;

            foreach (string sentence in sentences)
            {
                i++;
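                //Only the first two sentences are processed; i >= 3 skips the rest.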
                if (i >= 3)
                {
                    continue;
                }
                string normalized;
                try
                {
                    normalized = norm.NormalizeText(sentence);
                }
                catch (Exception ex)
                {
                    normalized = "[[CANNOT NORMALIZE:  " + ex.Message + " for " + sentence + "]]";
                    errors.AppendLine(normalized + "<br/>");
                    continue;
                }


                Sentence parsedSentence;
                try
                {
                    parsedSentence = pu.ParsedSentenceFactory(normalized, sentence);
                    parseSentences.Add(parsedSentence);
                }
                catch (Exception ex)
                {
                    string cantParse = "[[CANNOT PARSE:  " + ex.Message.ToHtml() + " for " + sentence.ToHtml() + "]]";
                    errors.AppendLine(cantParse.ToHtml() + "<br/>");
                }
                finally
                {
                    dialect.TargetGloss = "tp";
                }
            }

            parse.Json = parseSentences.ToJsonNet();
            parse.Xml  = FormatXml(parseSentences.ToDataContractXml());
            parse.Html = "Not implemented yet";
        }
        public void StressTestNormalizeAndParseEverything()
        {
            int         i       = 0;
            int         total   = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;

            CorpusFileReader reader = new CorpusFileReader(true);
            GlossMaker       gm     = new GlossMaker();
            SentenceSplitter ss     = new SentenceSplitter(dialect);

            Normalizer norm  = new Normalizer(dialect);
            Stopwatch  watch = new Stopwatch();

            watch.Start();
            foreach (string s in reader.NextFile())
            {
                if (reader.currentFile.ContainsCheck("janKipoCollected"))
                {
                    continue;                                                       // Can't parse:  *janMato 123 123 ni li musi!
                }
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    total++;
                    //if (watch.ElapsedMilliseconds > 15000) return;
                    //if (total > 1000) return;
                    Sentence structured = null;
                    try
                    {
                        string normalized = norm.NormalizeText(original);
                        structured = pu.ParsedSentenceFactory(normalized, original);
                        string diag = structured.ToString("b");

                        string gloss = gm.GlossOneSentence(false, structured, english);
                        // Console.WriteLine("O: " + (original??"").Trim(new []{'\n','\r',' ','\t'}));
                        // Console.WriteLine("B: " + diag);
                        // Console.WriteLine("G: " + gloss);
                    }
                    catch (Exception ex)
                    {
                        i++;
                        Console.WriteLine(SentenceDiagnostics.CurrentSentence.Original);
                        Console.WriteLine(ex.Message);
                        //else throw;
                    }
                }
            }
            Console.WriteLine("Failed Sentences: " + i);
        }
Example #8
        public void SplitterSplitsPeriodsTest()
        {
            SentenceSplitter splitter       = new SentenceSplitter();
            var           testSentence      = "This will be. Two sentences.";
            List <string> expectedSentences = new List <string> {
                "This will be.", "Two sentences."
            };
            List <string> sentences = splitter.Split(testSentence);

            Assert.That(sentences, Is.EquivalentTo(expectedSentences));
        }
Example #9
        public void QuotedSentence()
        {
            string           s       = @"""Saint Augustine Prophecy"".";
            Dialect          dialect = Dialect.LooseyGoosey;
            SentenceSplitter ss      = new SentenceSplitter(dialect);

            string[] results = ss.ParseIntoNonNormalizedSentences(s);
            Console.WriteLine(results[0]);

            Assert.IsTrue(!results[0].Contains("\""));
            //Assert.IsTrue(results[0].StartCheck("\""));
        }
        public void StressTest_Parse_SpitBack_LooselyCompare()
        {
            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            CorpusFileReader reader = new CorpusFileReader(true);
            Normalizer       norm   = new Normalizer(dialect);

            SentenceSplitter ss = new SentenceSplitter(dialect);

            int total = 0;
            int j     = 0;

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    if (original.StartCheck("*") && reader.currentFile.ContainsCheck("janKipoCollected"))  // Can't parse:  *janMato 123 123 ni li musi!
                    {
                        continue;
                    }
                    if (original.StartCheck("///"))  //Don't care if commengs got corrupted.
                    {
                        continue;
                    }

                    total++;
                    try
                    {
                        string normalized = norm.NormalizeText(original);

                        Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                        string   diag       = structured.ToString();

                        if (!diag.TpLettersEqual(original))
                        {
                            Console.WriteLine("O: " + original.Trim(new[] { ' ', '\t', '\n', '\r' }).Replace("\n", " "));
                            Console.WriteLine("G: " + diag);
                            Console.WriteLine(" --- ");
                            j++;
                        }
                    }
                    catch (Exception)
                    {
                        i++;
                    }
                }
            }
            Console.WriteLine("Total: " + total);
            Console.WriteLine("Mismatched: " + j);
            Console.WriteLine("Failed Sentences: " + i);
        }
Example #11
        public void QuotationMarksAreRemovedTest()
        {
            SentenceSplitter splitter       = new SentenceSplitter();
            var           testSentence      = "This will be. \"Four sentences. Also a quote.\" Trailing too.";
            List <string> expectedSentences = new List <string>
            {
                "This will be.", "Four sentences.", "Also a quote.", "Trailing too."
            };
            List <string> sentences = splitter.Split(testSentence);

            Assert.That(sentences, Is.EquivalentTo(expectedSentences));
        }
        public void StressTest_ParsePARAGRAPHS_SpitBack_LooselyCompare()
        {
            CorpusFileReader reader = new CorpusFileReader(true);

            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            ParagraphSplitter paragraphSplitter = new ParagraphSplitter(dialect);

            paragraphSplitter.ThrowOnErrors = false;
            Normalizer norm = new Normalizer(dialect);

            SentenceSplitter ss = new SentenceSplitter(dialect);

            int total = 0;
            int j     = 0;

            foreach (string s in reader.NextFile())
            {
                if (reader.currentFile.Contains("ipoC"))
                {
                    continue;
                }
                Prose prose = paragraphSplitter.ParseProse(s);
                foreach (Paragraph paragraph in prose.Paragraphs)
                {
                    foreach (Sentence sentence in paragraph)
                    {
                        total++;
                        try
                        {
                            string repeatBack = sentence.ToString();
                            string original   = sentence.Diagnostics.Original;
                            if (!repeatBack.TpLettersEqual(original))
                            {
                                Console.WriteLine("O: " + original.Trim(new[] { ' ', '\t', '\n', '\r' }).Replace("\n", " "));
                                Console.WriteLine("G: " + repeatBack);
                                Console.WriteLine(" --- ");
                                j++;
                            }
                        }
                        catch (Exception)
                        {
                            i++;
                        }
                    }
                }
            }
            Console.WriteLine("Total: " + total);
            Console.WriteLine("Mismatched: " + j);
            Console.WriteLine("Failed Sentences: " + i);
        }
        public static string NormalizeExplicit(string sentence, Dialect dialect)
        {
            if (string.IsNullOrWhiteSpace(sentence))
            {
                throw new ArgumentNullException("sentence", "No null or blank sentences");
            }
            if (sentence.EndCheck(" "))
            {
                throw new ArgumentException("Must trim spaces before calling this", "sentence");
            }

            string normalized = SentenceSplitter.SwapQuoteAndSentenceTerminatorOrder(sentence);

            return(normalized);
        }
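        //Sketch only (not part of the library): a hypothetical caller for the static helper above,
        //assuming it is in scope in the same class; it reuses only the Dialect.LooseyGoosey value
        //and respects the preconditions the helper itself enforces (non-blank input, no trailing spaces).
        public void NormalizeExplicit_UsageSketch()
        {
            Dialect dialect  = Dialect.LooseyGoosey;
            string  sentence = @"jan li toki e ni: ""o kama!"" ".Trim(); //trim first, or the helper throws

            string normalized = NormalizeExplicit(sentence, dialect);
            Console.WriteLine(normalized); //quote/terminator order swapped by SentenceSplitter
        }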
Example #14
        public void FullyEnclosedQuoteSentencesAreCaptured()
        {
            SentenceSplitter splitter   = new SentenceSplitter();
            string           testString =
                "And the Universal AC answered. \"THERE IS AS YET INSUFFICIENT DATA FOR A MEANINGFUL ANSWER.\"";
            var splitSentences = splitter.Split(testString);

            var expectedSentences = new List <string>
            {
                "And the Universal AC answered.",
                "\"THERE IS AS YET INSUFFICIENT DATA FOR A MEANINGFUL ANSWER.\""
            };

            Assert.That(splitSentences, Is.EquivalentTo(expectedSentences));
        }
Example #15
        public void SplitterCanSplitFullFile()
        {
            SentenceSplitter splitter   = new SentenceSplitter();
            string           testString =
                "VJ-23X was not really serious, but MQ-17J pulled out his AC-contact from his pocket and placed it on the table before him?\n\"I've half a mind to,\" he said! \"It's something the human race will have to face someday.";
            var splitSentences = splitter.Split(testString);

            var expectedSentences = new List <string>
            {
                "VJ-23X was not really serious, but MQ-17J pulled out his AC-contact from his pocket and placed it on the table before him?",
                "I've half a mind to,\" he said!", "It's something the human race will have to face someday."
            };

            Assert.That(splitSentences, Is.EquivalentTo(expectedSentences));
        }
        private static void Execute(string s)
        {
            Dialect          dialect = Dialect.LooseyGoosey;
            Normalizer       norm    = new Normalizer(dialect);
            ParserUtils      pu      = new ParserUtils(dialect);
            SentenceSplitter ss      = new SentenceSplitter(dialect);

            foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
            {
                Console.WriteLine("----");
                string normalized = norm.NormalizeText(original);
                Console.WriteLine(normalized);
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);
            }
        }
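        //Sketch only: the same split/normalize/parse pipeline as Execute above, extended with the
        //GlossMaker.GlossOneSentence call used in the stress tests. The name ExecuteAndGloss is made
        //up for this sketch; every call inside it already appears elsewhere in these examples, and
        //there is no try/catch here, so corpus text that fails to parse will throw.
        private static void ExecuteAndGloss(string s)
        {
            Dialect dialect = Dialect.LooseyGoosey;

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;

            Normalizer       norm = new Normalizer(dialect);
            ParserUtils      pu   = new ParserUtils(dialect);
            SentenceSplitter ss   = new SentenceSplitter(dialect);
            GlossMaker       gm   = new GlossMaker();

            foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
            {
                Console.WriteLine("----");
                string   normalized = norm.NormalizeText(original);
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);

                Console.WriteLine("B: " + structured.ToString("b"));                        //bracketed breakdown
                Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english)); //English gloss
            }
        }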
Example #17
        public CorpusKnowledge(string corpus, Dialect dialect)
        {
            //https://stackoverflow.com/questions/521146/c-sharp-split-string-but-keep-split-chars-separators
            //https://stackoverflow.com/questions/3115150/how-to-escape-regular-expression-special-characters-using-javascript

            this.dialect = dialect;
            this.norm    = new Normalizer(dialect);

            SentenceSplitter ss = new SentenceSplitter(dialect);

            sentences = ss.ParseIntoNonNormalizedSentences(corpus);
            for (int index = 0; index < sentences.Length; index++)
            {
                sentences[index] = norm.NormalizeText(sentences[index]);
            }
        }
        public void IdentifyDiscourses_CanWeGroupThem()
        {
            Dialect    dialect = Dialect.LooseyGoosey;
            Normalizer norm    = new Normalizer(dialect);

            ParserUtils      pu = new ParserUtils(dialect);
            SentenceSplitter ss = new SentenceSplitter(dialect);

            Sentence[] s = ss
                           .ParseIntoNonNormalizedSentences(CorpusTexts.UnpaText)
                           .Where(x => !string.IsNullOrWhiteSpace(x))
                           .Select(x =>
            {
                string normalized = norm.NormalizeText(x);
                if (string.IsNullOrWhiteSpace(normalized))
                {
                    return(null);
                }
                return(pu.ParsedSentenceFactory(normalized, x));
            })
                           .Where(x => x != null)
                           .ToArray();
            Assert.Greater(s.Length, 0);


            List <Sentence>[] d = ss.GroupIntoDiscourses(s);

            Assert.Greater(d.Length, 0);

            foreach (List <Sentence> discourse in d)
            {
                Assert.Greater(discourse.Count, 0);
            }

            Console.WriteLine("-------------------");
            foreach (List <Sentence> discourse in d)
            {
                int i = 0;
                foreach (Sentence sentence in discourse)
                {
                    i++;
                    Console.WriteLine(i + ") " + sentence.ToString("b"));
                }

                Console.WriteLine("-------------------");
            }
        }
        public void SplitSentenceWithColon_Normalized()
        {
            const string     s       = "sina toki e ni: mi wile e ni.";
            Dialect          dialect = Dialect.LooseyGoosey;
            Normalizer       norm    = new Normalizer(dialect);
            SentenceSplitter ss      = new SentenceSplitter(dialect);

            string[] sentences = ss.ParseIntoNonNormalizedSentences(s);

            for (int index = 0; index < sentences.Length; index++)
            {
                sentences[index] = norm.NormalizeText(sentences[index]);
            }

            Assert.AreEqual("sina li toki e ni:", sentences[0]);
            Assert.AreEqual("mi li wile e ni.", sentences[1]);
        }
        public void NormalizeAllTextFiles()
        {
            int        i       = 0;
            Dialect    dialect = Dialect.LooseyGoosey;
            Normalizer norm    = new Normalizer(dialect);
            //ParserUtils pu  = new ParserUtils(dialect);

            CorpusFileReader reader = new CorpusFileReader();
            SentenceSplitter ss     = new SentenceSplitter(dialect);

            foreach (string s in reader.NextFile())
            {
                foreach (string sentence in ss.ParseIntoNonNormalizedSentences(s))
                {
                    string  result  = norm.NormalizeText(sentence);
                    decimal percent = NormalizeForeignText.PercentTokiPona(result);
                    Console.WriteLine(percent + "%");
                    i++;
                }
            }
            Console.WriteLine("Sentences normalized: " + i);
        }
        public void StressTestParseThingsWithNumbers()
        {
            int     i       = 0;
            Dialect dialect = Dialect.LooseyGoosey;

            dialect.InferCompoundsPrepositionsForeignText = false;
            dialect.InferNumbers = true;
            dialect.NumberType   = "Body";
            Normalizer       norm   = new Normalizer(dialect);
            CorpusFileReader reader = new CorpusFileReader();
            SentenceSplitter ss     = new SentenceSplitter(dialect);

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    try
                    {
                        string normalized = norm.NormalizeText(original);
                        string result     = NormalizeNumbers.FindNumbers(normalized, dialect);
                        if (result.ContainsCheck("#"))
                        {
                            Console.WriteLine("O: " + original);
                            Console.WriteLine("N: " + normalized);
                            Console.WriteLine("#: " + result);
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("ORIGINAL  : " + original);
                        Console.WriteLine(ex.Message);
                        i++;
                    }
                }
            }
            Console.WriteLine("Failed Sentences: " + i);
        }
Example #22
        public SentenceSplit()
        {
            InitializeComponent();

            SS = new SentenceSplitter(); //The default constructor uses the built-in lexicon; to use an external lexicon, specify it via a constructor parameter.
        }
        public void StressTestMultipleLa()
        {
            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            dialect.NumberType = "Stupid";

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;

            Normalizer norm = new Normalizer(dialect);

            CorpusFileReader reader = new CorpusFileReader();
            GlossMaker       gm     = new GlossMaker();
            SentenceSplitter ss     = new SentenceSplitter(dialect);

            int total = 0;

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    if (original.Contains(" su "))
                    {
                        continue;                            //neologism, from back when we didn't know what pu was and hoped it was something like the Scandinavian sem
                    }
                    if (original.Contains("o weka e  jan Opama tan tomo walo"))
                    {
                        continue;                                                        //quoted text treated as a content word.
                    }
                    try
                    {
                        string normalized = norm.NormalizeText(original);


                        //Multiple la's
                        if (normalized.Split(new string[] { " la " }, StringSplitOptions.RemoveEmptyEntries).Length <= 2)
                        {
                            continue;
                        }

                        Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                        string   diag       = structured.ToString("b");
                        string   spitBack   = structured.ToString();

                        //if ((normalized.ContainsCheck("%ante"))) continue; //verb!

                        Console.WriteLine("Org: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                        Console.WriteLine("Rep: " + spitBack);
                        Console.WriteLine("Brk: " + diag);
                        //Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
                        total++;
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("ORIGINAL  : " + original);
                        Console.WriteLine(ex.Message);
                        i++;
                    }
                }
            }
            Console.WriteLine("Total : " + total);
            Console.WriteLine("Failed Sentences: " + i);
        }
        public void StressTestNormalizeNotIndeedAlaKin()
        {
            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;

            Normalizer norm = new Normalizer(dialect);

            CorpusFileReader reader = new CorpusFileReader();
            GlossMaker       gm     = new GlossMaker();
            SentenceSplitter ss     = new SentenceSplitter(dialect);

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    if (original.Contains(" su "))
                    {
                        continue;                            //neologism, from back when we didn't know what pu was and hoped it was something like the Scandinavian sem
                    }
                    //try
                    //{
                    string normalized = norm.NormalizeText(original);

                    if (!(normalized.ContainsWholeWord("ala") || normalized.ContainsWholeWord("kin")))
                    {
                        continue;
                    }
                    if (normalized.ContainsCheck("Kinla"))
                    {
                        continue;                                   //Has a logical operator in one of the sample sentences that I can't deal with yet, unrelated to kin, ala
                    }
                    if (normalized.StartCheck("kin la "))
                    {
                        continue;                                   //no big deal
                    }
                    if (normalized.ContainsCheck("pilin pona o"))
                    {
                        continue;                                           //Not trying to solve vocatives right now
                    }
                    if (normalized.ContainsCheck(" o, "))
                    {
                        continue;                                   //Not trying to solve vocatives right now
                    }
                    Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                    string   diag       = structured.ToString("b");

                    //if ((normalized.ContainsCheck("%ante"))) continue; //verb!

                    Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                    Console.WriteLine("B: " + diag);
                    Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
                    //}
                    //catch (Exception ex)
                    //{
                    //    if (ex.Message.ContainsCheck("all tests"))
                    //    {
                    //        Console.WriteLine("ORIGINAL  : " + original);
                    //        if (structured != null)
                    //        {
                    //            Console.WriteLine(structured.ToString("b"));
                    //        }
                    //        Console.WriteLine(ex.Message);
                    //        i++;
                    //    }
                    //    else throw;
                    //}
                }
            }
            Console.WriteLine("Failed Sentences: " + i);
        }
Example #25
        private static void ProcessParserModelSentences(SimpleParserViewModel parse)
        {
            Dialect dialect = BindDialect(parse);

            ParserUtils      pu   = new ParserUtils(dialect);
            SentenceSplitter ss   = new SentenceSplitter(dialect);
            Normalizer       norm = new Normalizer(dialect);

            string[]      sentences    = ss.ParseIntoNonNormalizedSentences(parse.SourceText);
            StringBuilder normalizedSb = new StringBuilder();
            StringBuilder spitBackSb   = new StringBuilder();
            StringBuilder bracketSb    = new StringBuilder();
            StringBuilder posSb        = new StringBuilder();
            StringBuilder glossSb      = new StringBuilder();

            StringBuilder errors    = new StringBuilder();
            StringBuilder colorized = new StringBuilder();

            HtmlFormatter hf = new HtmlFormatter();

            int i = 1;
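            //For each sentence: normalize, parse, then build repeat-back, colorized, bracketed, glossed and part-of-speech views, recording errors per step.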

            foreach (string sentence in sentences)
            {
                string lineNumber = LineNumber(i, true);
                string normalized;
                try
                {
                    normalized = norm.NormalizeText(sentence);
                }
                catch (Exception ex)
                {
                    string error = "[[CANNOT NORMALIZE:  " + ex.Message + "]]";
                    normalizedSb.AppendLine(error.ToHtml() + "<br/>");
                    normalizedSb.AppendLine(hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                    normalized = sentence;
                    UpdateErrors(errors, error, sentence);
                }
                //////// TP
                normalizedSb.AppendLine(lineNumber + hf.BoldTheWords(normalized.ToHtml()) + "<br/>");

                try
                {
                    Sentence parsedSentence = pu.ParsedSentenceFactory(normalized, sentence);

                    //////// TP
                    try
                    {
                        spitBackSb.AppendLine(lineNumber + parsedSentence.ToString("g", dialect).ToHtml() + "<br/>");
                    }
                    catch (Exception ex)
                    {
                        string error = "[[CANNOT REPEAT BACK:  " + ex.Message + " for " + sentence + "]]";
                        spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                        spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                        UpdateErrors(errors, error, sentence);
                    }

                    try
                    {
                        //string result = parsedSentence.ToString("html", dialect);
                        //if (result.Replace("<span", "").Contains("<"))
                        //{
                        //    throw new InvalidOperationException("No HTML allowed in input");
                        //}
                        colorized.AppendLine(lineNumber + parsedSentence.ToString("html", dialect) + "<br/>");
                    }
                    catch (Exception ex)
                    {
                        string error = "[[CANNOT COLORIZE:  " + ex.Message + "]]";
                        spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                        spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");

                        UpdateErrors(errors, error, sentence);
                    }

                    //////// TP
                    try
                    {
                        bracketSb.AppendLine(lineNumber + hf.BoldTheWords(parsedSentence.ToString("b", dialect).ToHtml()) + "<br/>");
                    }
                    catch (Exception ex)
                    {
                        string error = "[[CANNOT BRACKET:  " + ex.Message + " for " + sentence + "]]";
                        bracketSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                        bracketSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                        UpdateErrors(errors, error, sentence);
                    }


                    //////// ENGLISH
                    try
                    {
                        dialect.TargetGloss = "en";
                        GlossMaker gm      = new GlossMaker();
                        string     glossed = gm.Gloss(normalized, sentence, "en", false);
                        glossSb.AppendLine(lineNumber + glossed.ToHtml() + "<br/>");
                        glossed = gm.Gloss(normalized, sentence, "en", true);
                        posSb.AppendLine(lineNumber + glossed.ToHtml() + "<br/>"); //bs doesn't do anything.
                    }
                    catch (Exception ex)
                    {
                        string error = "[[CANNOT GLOSS:  " + ex.Message.ToHtml() + " for " + sentence.ToHtml() + "]]";
                        glossSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                        glossSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");

                        posSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                        posSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");

                        UpdateErrors(errors, error, sentence);
                    }
                }
                catch (Exception ex)
                {
                    string error = "[[CANNOT Parse:  " + ex.Message.ToHtml() + "]]";

                    foreach (StringBuilder sb in new StringBuilder[] { //normalizedSb,
                        spitBackSb, bracketSb, posSb, glossSb, colorized
                    })
                    {
                        sb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
                        sb.Append(sentence.ToHtml() + "<br/>");
                    }

                    UpdateErrors(errors, error, sentence);
                }
                finally
                {
                    dialect.TargetGloss = "tp";
                }
                i++;
            }

            parse.Normalized   = normalizedSb.ToString();
            parse.Recovered    = spitBackSb.ToString();
            parse.Formatted    = bracketSb.ToString();
            parse.FormattedPos = hf.SubThePartsOfSpeech(posSb.ToString());
            parse.Glossed      = glossSb.ToString();
            parse.Colorized    = colorized.ToString();
            parse.Errors       = errors.ToString();
        }
        public void StressTestNormalize_VocativeImperatives()
        {
            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;


            Normalizer norm = new Normalizer(dialect);

            CorpusFileReader reader = new CorpusFileReader();
            //GlossMaker gm = new GlossMaker();
            SentenceSplitter ss = new SentenceSplitter(dialect);

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    Sentence structured = null;
                    try
                    {
                        string normalized = norm.NormalizeText(original);

                        if (string.IsNullOrWhiteSpace(normalized))
                        {
                            continue;
                        }
                        if (!(normalized.ContainsWholeWord("o")))
                        {
                            continue;
                        }
                        if (!(normalized.ContainsWholeWord("li")))
                        {
                            continue;
                        }
                        if ((normalized.StartsWith("o ")))
                        {
                            continue;                                //These seem to be okay
                        }
                        if (normalized.ContainsCheck("Kinla"))
                        {
                            continue;                                   //Has a logical operator in one of the sample sentences that I can't deal with yet, unrelated to kin, ala
                        }
                        structured = pu.ParsedSentenceFactory(normalized, original);
                        string diag = structured.ToString("b");

                        Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                        Console.WriteLine("B: " + diag);
                        Console.WriteLine("...");

                        //Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("FAILED : " + original);
                        if (structured != null)
                        {
                            Console.WriteLine(structured.ToString("b"));
                        }
                        Console.WriteLine(ex.Message);
                        i++;
                    }
                }
            }
            Console.WriteLine("Failed Sentences: " + i);
            Assert.AreEqual(0, i);
        }
        public void TransitionMatrix()
        {
            int     i       = 0;
            Dialect dialect = Dialect.LooseyGoosey;

            dialect.InferCompoundsPrepositionsForeignText = false;
            dialect.InferNumbers = true;
            dialect.NumberType   = "Body";
            Normalizer       norm   = new Normalizer(dialect);
            CorpusFileReader reader = new CorpusFileReader();
            SentenceSplitter ss     = new SentenceSplitter(dialect);
            ParserUtils      pu     = new ParserUtils(dialect);

            Dictionary <string, Dictionary <string, int> > matrix = new Dictionary <string, Dictionary <string, int> >(125);
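            //matrix[previousWord][nextWord] = number of times that transition is seen in the parsed corpus.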

            foreach (KeyValuePair <string, Word> pair in Words.Dictionary)
            {
                if (Word.Deprecated.Contains(pair.Key))
                {
                    continue;
                }
                Dictionary <string, int> following = new Dictionary <string, int>();
                foreach (KeyValuePair <string, Word> inner in Words.Dictionary)
                {
                    if (!Word.Deprecated.Contains(inner.Key))
                    {
                        following.Add(inner.Key, 0);
                    }
                }

                matrix.Add(pair.Key, following);
            }

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    try
                    {
                        string normalized = norm.NormalizeText(original);

                        Sentence      structured = pu.ParsedSentenceFactory(normalized, original);
                        string        restringed = structured.ToString("g");
                        List <string> parts      = restringed.Split(new[] { ' ' }).ToList();

                        string last    = null;
                        string current = null;
                        foreach (string part in parts)
                        {
                            current = part;
                            if (last == null)
                            {
                                //Can't do anything yet.
                            }
                            else
                            {
                                if (matrix.ContainsKey(last))
                                {
                                    Dictionary <string, int> transitionScores = matrix[last];
                                    if (transitionScores.ContainsKey(current))
                                    {
                                        transitionScores[current]++;
                                    }
                                }
                            }

                            last = current;
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("ORIGINAL  : " + original);
                        Console.WriteLine(ex.Message);
                        i++;
                    }
                }
            }
            Console.WriteLine("Failed Sentences: " + i);

            StringBuilder sb     = new StringBuilder();
            int           header = 0;
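            //Dump the counts as a tab-separated matrix: a header row of words, then one row of counts per head word.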

            foreach (KeyValuePair <string, Dictionary <string, int> > pair in matrix)
            {
                if (header == 0)
                {
                    header++;
                    Console.Write("head\t");
                    foreach (var scores in pair.Value)
                    {
                        Console.Write(scores.Key + "\t");
                    }
                    Console.WriteLine();
                }
                Console.Write(pair.Key + "\t");
                foreach (var scores in pair.Value)
                {
                    if (scores.Value > 0)
                    {
                        Console.Write(scores.Value + "\t");
                    }
                    else
                    {
                        Console.Write("\t");
                    }
                }
                Console.WriteLine();
            }
        }
        public void FindProperModifiers()
        {
            int     i       = 0;
            Dialect dialect = Dialect.LooseyGoosey;

            dialect.InferCompoundsPrepositionsForeignText = false;
            ParserUtils pu = new ParserUtils(dialect);

            Normalizer               norm   = new Normalizer(dialect);
            CorpusFileReader         reader = new CorpusFileReader();
            Dictionary <string, int> words  = new Dictionary <string, int>(500);

            SentenceSplitter ss = new SentenceSplitter(dialect);

            foreach (string s in reader.NextFile())
            {
                string[] sentences = ss.ParseIntoNonNormalizedSentences(s);
                foreach (string original in sentences)
                {
                    try
                    {
                        string   normalized = norm.NormalizeText(original);
                        Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                        //string diag = structured.ToString("b");


                        string stringified = structured.Subjects.ToString();
                        if (!stringified.Contains(" "))
                        {
                            continue;                            //single word
                        }
                        if (stringified.Contains(@""""))
                        {
                            continue;                             //foreign
                        }
                        if (stringified.StartsWith(@"nanpa"))
                        {
                            continue;                                  //implicit number
                        }
                        if (stringified.StartsWith(@"#"))
                        {
                            continue;                              //explicit number by punctuation
                        }
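                        //Count only subject phrases containing an upper-case letter, i.e. ones carrying a proper modifier.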
                        if (stringified.ContainsLetter(Token.AlphabetUpper))
                        {
                            if (words.ContainsKey(stringified))
                            {
                                words[stringified] = words[stringified] + 1;
                            }
                            else
                            {
                                words.Add(stringified, 1);
                                Console.WriteLine(i + " : " + stringified);
                            }
                        }
                    }
                    catch (Exception)
                    {
                        i++;
                    }
                }
            }
            foreach (KeyValuePair <string, int> pair in words.OrderBy(x => x.Value))
            {
                Console.WriteLine(pair.Key + " : " + pair.Value);
            }
        }
        public void StressTestNormalize_Pronouns()
        {
            int         i       = 0;
            Dialect     dialect = Dialect.LooseyGoosey;
            ParserUtils pu      = new ParserUtils(dialect);

            Normalizer norm = new Normalizer(dialect);

            Dialect english = Dialect.LooseyGoosey;

            english.TargetGloss        = "en";
            english.GlossWithFallBacks = true;

            CorpusFileReader reader = new CorpusFileReader();
            //GlossMaker gm = new GlossMaker();
            SentenceSplitter ss = new SentenceSplitter(dialect);

            foreach (string s in reader.NextFile())
            {
                foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
                {
                    //try
                    //{
                    string normalized = norm.NormalizeText(original);

                    if (!(normalized.ContainsWholeWord("mi") || normalized.ContainsWholeWord("sina") || normalized.ContainsWholeWord("ona")))
                    {
                        continue;
                    }
                    if (normalized.ContainsCheck("Kinla"))
                    {
                        continue;                                   //Has a logical operator in one of the sample sentences that I can't deal with yet, unrelated to kin, ala
                    }
                    if (normalized.ContainsCheck("o,"))
                    {
                        continue;                                //Haven't dealt with vocatives yet.
                    }
                    if (normalized.ContainsCheck(" li pi "))
                    {
                        continue;                                     //Will deal with these when I feel like it.
                    }
                    if (normalized.ContainsCheck("ona li alasa pona"))
                    {
                        return;                                               //Okay, this is some random point in the middle; a few hundred sentences is enough!
                    }
                    Sentence structured = pu.ParsedSentenceFactory(normalized, original);
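                    //Look for pronoun heads (mi/sina/ona) that carry modifiers, in both the complex and the simple subject chains.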

                    bool foundInteresting = false;
                    if (structured.Subjects != null)
                    {
                        if (structured.Subjects.ComplexChains != null)
                        {
                            foreach (ComplexChain innerComplexChain in structured.Subjects.ComplexChains)
                            {
                                if (innerComplexChain.SubChains == null)
                                {
                                    continue;
                                }

                                foreach (Chain subChain in innerComplexChain.SubChains)
                                {
                                    if (subChain.HeadedPhrases == null)
                                    {
                                        continue;
                                    }

                                    foreach (HeadedPhrase headedPhrase in subChain.HeadedPhrases)
                                    {
                                        if (headedPhrase.Modifiers == null || headedPhrase.Modifiers.Count == 0)
                                        {
                                            continue;
                                        }
                                        if (headedPhrase.Head.Text == "mi" || headedPhrase.Head.Text == "sina" ||
                                            headedPhrase.Head.Text == "ona")
                                        {
                                            Console.WriteLine("Found  : " + headedPhrase);
                                            foundInteresting = true;
                                        }
                                    }
                                }
                            }
                        }

                        if (structured.Subjects.SubChains != null)
                        {
                            foreach (Chain subChain in structured.Subjects.SubChains)
                            {
                                if (subChain.HeadedPhrases == null)
                                {
                                    continue;
                                }

                                foreach (HeadedPhrase headedPhrase in subChain.HeadedPhrases)
                                {
                                    if (headedPhrase.Modifiers == null || headedPhrase.Modifiers.Count == 0)
                                    {
                                        continue;
                                    }
                                    if (headedPhrase.Head.Text == "mi" || headedPhrase.Head.Text == "sina" ||
                                        headedPhrase.Head.Text == "ona")
                                    {
                                        Console.WriteLine("Found  : " + headedPhrase);
                                        foundInteresting = true;
                                    }
                                }
                            }
                        }
                    }

                    if (!foundInteresting)
                    {
                        continue;
                    }


                    string diag = structured.ToString("b");

                    //if ((normalized.ContainsCheck("%ante"))) continue; //verb!

                    Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                    Console.WriteLine("B: " + diag);
                    //Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
                    //}
                    //catch (Exception ex)
                    //{
                    //    if (ex.Message.ContainsCheck("all tests"))
                    //    {
                    //        Console.WriteLine("ORIGINAL  : " + original);
                    //        if (structured != null)
                    //        {
                    //            Console.WriteLine(structured.ToString("b"));
                    //        }
                    //        Console.WriteLine(ex.Message);
                    //        i++;
                    //    }
                    //    else throw;
                    //}
                }
            }
            Console.WriteLine("Failed Sentences: " + i);
        }