// Checks that NormalizationTasks.IsVocative reports true for a sentence
// that addresses "jan Oliwa" and then repeats "o" several times.
public void ProcessCalmVocative_JustRunIsVocative()
        {
            // Arrange: a name followed by repeated "o" particles.
            const string sentence = "jan Oliwa o, o, o.";

            // Act + Assert: the detector must flag the sentence.
            Assert.IsTrue(NormalizationTasks.IsVocative(sentence), "Expected to ID a vocative.");
        }
        /// <summary>
        /// Normalizes raw text by passing it through a fixed, order-dependent sequence of
        /// string rewrites: whitespace cleanup, comment stripping, foreign-text handling,
        /// line-break removal, optional number detection, parenthesis/comma cleanup,
        /// compound words, implicit prepositions, "mi"/"sina" handling and quote marking.
        /// When the dialect's InferCompoundsPrepositionsForeignText flag is off, the work is
        /// delegated to a <c>NormalizeExplicit</c> instance instead.
        /// </summary>
        /// <param name="text">The text to normalize; may be null or blank.</param>
        /// <returns>
        /// An empty string when <paramref name="text"/> is null, whitespace or rejected by
        /// <c>NormalizationTasks.IsNullWhiteOrPunctuation</c>; the unchanged input when it is a
        /// single-line text starting with "///"; otherwise the normalized text.
        /// </returns>
        /// <exception cref="NormalizationException">
        /// Thrown when the result starts with "« »", and (DEBUG builds only) when the
        /// intermediate text contains "[NULL]".
        /// </exception>
        public string NormalizeText(string text) //= null
        {
            if (!dialect.InferCompoundsPrepositionsForeignText)
            {
                //HACK: Not the way this should work.
                NormalizeExplicit ex = new NormalizeExplicit(dialect);
                return(ex.NormalizeText(text));
            }
            //NOTE(review): sd is never read in this method; presumably the constructor is
            //invoked for a side effect (diagnostics/logging?) -- confirm before removing.
            SentenceDiagnostics sd = new SentenceDiagnostics(text, "N/A");

            //Nothing to parse.
            if (string.IsNullOrWhiteSpace(text) || NormalizationTasks.IsNullWhiteOrPunctuation(text))
            {
                return("");
            }

            //Don't normalize a comment (single-line text starting with "///").
            if (text.StartCheck("///") && !text.Contains("\n"))
            {
                return(text);
            }

            //Initial whitespace cleanup (two helper passes; exact rules live in NormalizationTasks).
            string normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(text);

            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);

            //Collapse a doubled double-quote ("") into a single one (").
            //Is this better early or later?
            if (normalized.Contains(@""""""))
            {
                normalized = normalized.Replace(@"""""", @"""");
            }

            //Hide tokens that otherwise have a different meaning.
            //NOTE(review): the XXXXZiXXXX placeholder is not restored anywhere in this method;
            //presumably a later stage maps it back -- confirm.
            if (normalized.ContainsCheck(" li pi "))
            {
                normalized = normalized.Replace(" li pi ", " li XXXXZiXXXX ");
            }


            //  "/\\*.*?\\*/"
            // Things that cross sentences should already be dealt with earlier.
            if (normalized.ContainsCheck("/*") && normalized.ContainsCheck("*/"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Comments", NormalizationTasks.StripMultilineComments);
            }

            //Process whitespace inside explicitly quoted foreign text (runs whenever a double quote is present).
            if (normalized.ContainsCheck("\""))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ForeignSpace", NormalizationTasks.ProcessWhiteSpaceInForeignText, dialect);
            }

            //Process implicit foreign text. The guard at the top of the method already returned
            //when this flag was false, so (unless dialect changes in between) this always runs.
            if (dialect.InferCompoundsPrepositionsForeignText)
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Foreign", NormalizeForeignText.NormalizeImplicit, dialect);
            }

            //Hyphenated words: a hyphen at a line end is removed and the halves are joined.
            //This could cause a problem for compound words that cross lines.
            if (normalized.ContainsCheck("-\n"))
            {
                normalized = normalized.Replace("-\n", "");
            }

            //Can't cope with line breaks or tabs: turn each into a single space.
            if (normalized.ContainsCheck("\n"))
            {
                normalized = normalized.Replace("\n", " ");
            }
            if (normalized.ContainsCheck("\t"))
            {
                normalized = normalized.Replace("\t", " ");
            }

            //Number detection; must be after the hyphen ("-") processing above.
            if (dialect.InferNumbers)
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Numbers", NormalizeNumbers.FindNumbers, dialect);
            }



            //Extraneous punctuation: parentheses are dropped. TODO, expand to most other symbols.
            if (normalized.ContainsCheck("(") || normalized.ContainsCheck(")"))
            {
                normalized = normalized.Replace("(", "");
                normalized = normalized.Replace(")", "");
            }

            //Extraneous commas
            if (normalized.ContainsCheck(","))
            {
                //Benefit of the doubt. if you see , sama, ==> ~sama
                //Otherwise, assume it is garbage.
                //",prep" becomes "~prep" (no space added); ", prep" becomes " ~prep".
                foreach (string prep in Particles.Prepositions)
                {
                    if (normalized.ContainsCheck("," + prep))
                    {
                        normalized = normalized.Replace("," + prep, "~" + prep);
                    }
                    if (normalized.ContainsCheck(", " + prep))
                    {
                        normalized = normalized.Replace(", " + prep, " ~" + prep);
                    }
                }


                //Whatever commas remain are handed to the extraneous-comma pass.
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraCommas", NormalizationTasks.ProcessExtraneousCommas);
            }

            //Left overs from initial parsing: fail loudly in DEBUG builds, strip silently otherwise.
            if (normalized.ContainsCheck("[NULL]"))
            {
#if DEBUG
                throw new NormalizationException("Stop adding [NULL] to normalized sentences.");
#else
                normalized = normalized.Replace("[NULL]", "");
#endif
            }
            //Prepositions were normalized to ~ above, so that we don't have tokens with embedded spaces (e.g. foo, kepeken => [foo],[, kepeken])

            if (normalized.ContainsCheck(" "))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraWhiteSpace", NormalizationTasks.ProcessExtraneousWhiteSpace);
            }



            //Okay, phrases should be recognizable now.
            //(Flag is already known to be true here -- see the guard at the top of the method.)
            if (dialect.InferCompoundsPrepositionsForeignText)
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Compounds", cw.ProcessCompoundWords);
            }


            //Mark implicit prepositions; the helper receives both the original and the working text.
            if (dialect.InferCompoundsPrepositionsForeignText)
            {
                normalized = NormalizationTasks.MarkImplicitPrepositions(text, normalized);
            }

            //la o
            //invisible implicit subject: a placeholder subject "jan Sanwan" is inserted before "o".
            if (normalized.ContainsCheck(" la o "))
            {
                normalized = normalized.Replace(" la o ", " la jan Sanwan o ");
            }

            normalized = NormalizeMiSina.MiSinaProcessAndUndoOverNormalization(normalized);

            //Give the helper a chance to revisit ~ markers (per its name, un-marking false prepositions).
            if (normalized.ContainsCheck("~"))
            {
                normalized = NormalizationTasks.ThoseArentPrepositions(normalized);
            }

            normalized = Regex.Replace(normalized, @"^\s+|\s+$", ""); //Remove leading/trailing whitespace


            //If it is a sentence fragment, I really can't deal with prep phrase that may or may not be in it.
            //So every ~ marker is dropped unless the text contains " li " or starts with "o ".
            if (normalized.ContainsCheck("~") &&
                !normalized.ContainsCheck(" li ") && //full sentence okay
                !normalized.StartCheck("o ")    //imperative okay
                )
            {
                normalized = normalized.Replace("~", ""); //HACK: This may erase ~ added by user at the start?
            }

            normalized = NormalizeMiSina.ProcessMiSinaOvernormalizationWithPrepositions(normalized);


            normalized = NormalizeMiSina.ProcessMiSinaOverNormalizationWithoutPrepositions(text, normalized);

            //One off that comes back? Strip the ~ marker from these specific phrases.
            foreach (string oneOff in new[] {
                "li ~lon poka e",                                          //place something next to
                "li ~tawa tu e"
            })
            {
                normalized = normalized.Replace(oneOff, oneOff.Replace("~", ""));
            }


            //Single quotes present: run the directed-quote pass.
            if (normalized.ContainsCheck("'"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "DirectQuotes", NormalizationTasks.AddDirectedQuotes);
            }

            //Post conditions: an empty pair of directed quotes at the start means quote marking failed.
            if (normalized.StartCheck("« »"))
            {
                throw new NormalizationException("quote recognition went wrong: " + text);
            }


            //Final whitespace cleanup: extra whitespace was probably added above by mistake.
            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);
            normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(normalized);

            //NOTE(review): sd is reassigned but still never read; see the note at its declaration.
            sd = new SentenceDiagnostics(text, normalized);
            return(normalized);
        }
Example #3
0
        /// <summary>
        /// Normalizes raw text through a fixed, order-dependent sequence of string rewrites
        /// without any inference steps: numbers and compound words are not inferred and the
        /// implicit-preposition pass is commented out. Only explicit markers (quotes, commas
        /// before prepositions, comments) are processed.
        /// </summary>
        /// <param name="text">The text to normalize; may be null or blank.</param>
        /// <returns>
        /// An empty string when <paramref name="text"/> is null, whitespace or rejected by
        /// <c>NormalizationTasks.IsNullWhiteOrPunctuation</c>; the unchanged input when it is a
        /// single-line text starting with "///"; otherwise the normalized text.
        /// </returns>
        /// <exception cref="NormalizationException">
        /// Thrown when the final result starts with "« »".
        /// </exception>
        public string NormalizeText(string text) //= null
        {
            //NOTE(review): sd is never read in this method; presumably the constructor is
            //invoked for a side effect (diagnostics/logging?) -- confirm before removing.
            SentenceDiagnostics sd = new SentenceDiagnostics(text, "N/A");

            //Nothing to parse.
            if (string.IsNullOrWhiteSpace(text) || NormalizationTasks.IsNullWhiteOrPunctuation(text))
            {
                return("");
            }
            //Don't normalize a comment (single-line text starting with "///").
            if (text.StartCheck("///") && !text.Contains("\n"))
            {
                return(text);
            }

            //Initial whitespace cleanup (two helper passes; exact rules live in NormalizationTasks).
            string normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(text);

            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);

            //Collapse a doubled double-quote ("") into a single one (").
            //Is this better early or later?
            if (normalized.Contains(@""""""))
            {
                normalized = normalized.Replace(@"""""", @"""");
            }

            //Hide tokens that otherwise have a different meaning.
            //NOTE(review): the XXXXZiXXXX placeholder is not restored anywhere in this method;
            //presumably a later stage maps it back -- confirm.
            if (normalized.ContainsCheck(" li pi "))
            {
                normalized = normalized.Replace(" li pi ", " li XXXXZiXXXX ");
            }


            //  "/\\*.*?\\*/"
            // Things that cross sentences should already be dealt with earlier.
            if (normalized.ContainsCheck("/*") && normalized.ContainsCheck("*/"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Comments", NormalizationTasks.StripMultilineComments);
            }

            //Process whitespace inside explicitly quoted foreign text (runs whenever a double quote is present).
            if (normalized.ContainsCheck("\""))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ForeignSpace", NormalizationTasks.ProcessWhiteSpaceInForeignText, dialect);
            }

            //Explicit foreign-text pass; unconditional, unlike the quote-guarded step above.
            //Swap terminators (always happens)
            normalized = NormalizationTasks.ApplyNormalization(normalized, "Foreign", NormalizeForeignText.NormalizeExplicit, dialect);


            //Hyphenated words: a hyphen at a line end is removed and the halves are joined.
            //This could cause a problem for compound words that cross lines.
            if (normalized.ContainsCheck("-\n"))
            {
                normalized = normalized.Replace("-\n", "");
            }

            //Can't cope with line breaks or tabs: turn each into a single space.
            if (normalized.ContainsCheck("\n"))
            {
                normalized = normalized.Replace("\n", " ");
            }
            if (normalized.ContainsCheck("\t"))
            {
                normalized = normalized.Replace("\t", " ");
            }

            //A number pass would have to come after the hyphen ("-") processing,
            //but this method deliberately does not infer numbers.

            //Extraneous punctuation: parentheses are dropped. TODO, expand to most other symbols.
            if (normalized.ContainsCheck("(") || normalized.ContainsCheck(")"))
            {
                normalized = normalized.Replace("(", "");
                normalized = normalized.Replace(")", "");
            }

            //Extraneous commas... not sure, we'd like some to go away, but we want ,lon ,sama etc to stay.
            //if (normalized.ContainsCheck(","))
            //{
            //    normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraCommas", NormalizationTasks.ProcessExtraneousCommas);
            //}

            //Normalize prepositions to ~, so that we don't have tokens with embedded spaces (e.g. foo, kepeken => [foo],[, kepeken])
            //Both ",prep" and ", prep" become " ~prep".

            if (normalized.ContainsCheck(","))
            {
                foreach (string prep in Particles.Prepositions)
                {
                    if (normalized.ContainsCheck("," + prep))
                    {
                        normalized = normalized.Replace("," + prep, " ~" + prep);
                    }
                    if (normalized.ContainsCheck(", " + prep))
                    {
                        normalized = normalized.Replace(", " + prep, " ~" + prep);
                    }
                }
            }

            if (normalized.ContainsCheck(" "))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraWhiteSpace", NormalizationTasks.ProcessExtraneousWhiteSpace);
            }



            //Okay, phrases should be recognizable now.
            //Don't infer compound words

            //Implicit-preposition marking is intentionally disabled here:
            //if (dialect.InferCompoundsPrepositionsForeignText)
            //{
            //    normalized = NormalizationTasks.MarkImplicitPrepositions(text, normalized);
            //}

            //la o
            //invisible implicit subject: a placeholder subject "jan Sanwan" is inserted before "o".
            if (normalized.ContainsCheck(" la o "))
            {
                normalized = normalized.Replace(" la o ", " la jan Sanwan o ");
            }

            normalized = NormalizeMiSina.MiSinaProcessAndUndoOverNormalization(normalized);

            normalized = Regex.Replace(normalized, @"^\s+|\s+$", ""); //Remove leading/trailing whitespace

            normalized = NormalizeMiSina.ProcessMiSinaOvernormalizationWithPrepositions(normalized);

            normalized = NormalizeMiSina.ProcessMiSinaOverNormalizationWithoutPrepositions(text, normalized);

            //Single quotes present: run the directed-quote pass.
            if (normalized.ContainsCheck("'"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "DirectQuotes", NormalizationTasks.AddDirectedQuotes);
            }

            //Final whitespace cleanup: extra whitespace was probably added above by mistake.
            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);
            normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(normalized);

            //Post conditions: an empty pair of directed quotes at the start means quote marking failed.
            if (normalized.StartCheck("« »"))
            {
                throw new NormalizationException("quote recognition went wrong: " + text);
            }

            //NOTE(review): sd is reassigned but still never read; see the note at its declaration.
            sd = new SentenceDiagnostics(text, normalized);
            return(normalized);
        }
Example #4
0
        /// <summary>
        /// Runs the "mi" and "sina" normalization passes, reverts the cases where a pronoun
        /// followed by a modifier was over-normalized (e.g. "mi li wan li " back to
        /// "mi wan li "), and finally expands "la mi" / "la sina" to "la mi li" / "la sina li".
        /// </summary>
        /// <param name="normalized">The partially normalized sentence text.</param>
        /// <returns>The text after the mi/sina passes and the undo step.</returns>
        public static string MiSinaProcessAndUndoOverNormalization(string normalized)
        {
            //TODO: detect start of sentence & replace mi X and sina Y with

            if (normalized.ContainsCheck("mi"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "mi li", ProcessMi);
            }

            if (normalized.ContainsCheck("sina"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "sina li", ProcessSina);
            }

            // Key = intended form, Value = over-normalized form to look for.
            // Every value ends with a space so that it only matches a whole "li" token;
            // without it "mi li en sina li" would also match inside "mi li en sina lili".
            Dictionary <string, string> pronounModifiersMap = new Dictionary <string, string>
            {
                { "mi wan li ", "mi li wan li " },
                { "mi tu li ", "mi li tu li " },
                { "mi mute li ", "mi li mute li " },
                { "mi suli li ", "mi li suli li " },
                { "sina wan li ", "sina li wan li " },
                { "sina tu li ", "sina li tu li " },
                { "sina mute li ", "sina li mute li " },
                { "sina suli li ", "sina li suli li " },
                { "mi en sina li ", "mi li en sina li " }
            };

            //undo overnormalization
            foreach (KeyValuePair <string, string> pair in pronounModifiersMap)
            {
                if (normalized.ContainsCheck(pair.Value))
                {
                    normalized = normalized.Replace(pair.Value, pair.Key);
                }
            }

            // "mi" is handled first; the "sina" pass then sees the already-updated text.
            normalized = InsertLiAfterLaPronoun(normalized, "mi");
            normalized = InsertLiAfterLaPronoun(normalized, "sina");

            return(normalized);
        }

        /// <summary>
        /// Expands "la &lt;pronoun&gt;" to "la &lt;pronoun&gt; li" unless the text contains
        /// "la " followed by any entry of pronounModifiers, then collapses an accidental "li li".
        /// </summary>
        /// <param name="normalized">The text to rewrite.</param>
        /// <param name="pronoun">The bare pronoun ("mi" or "sina").</param>
        /// <returns>The rewritten text, or the input unchanged when nothing applies.</returns>
        private static string InsertLiAfterLaPronoun(string normalized, string pronoun)
        {
            string laPronoun = "la " + pronoun;

            if (!normalized.ContainsCheck(laPronoun))
            {
                return(normalized);
            }

            // A modified pronoun after "la" anywhere in the text blocks the rewrite entirely.
            foreach (string pronounModifier in pronounModifiers)
            {
                if (normalized.ContainsCheck("la " + pronounModifier))
                {
                    return(normalized);
                }
            }

            //normalize contractions
            normalized = Regex.Replace(normalized, @"\b" + Regex.Escape(laPronoun) + @"\b", laPronoun + " li");

            //If original was, say, "kin la mi li pali", we get a double li li
            if (normalized.ContainsCheck(" li li "))
            {
                //undo doubling.
                normalized = Regex.Replace(normalized, @"\bli li\b", "li");
            }

            return(normalized);
        }