Ejemplo n.º 1
0
        /// <summary>
        /// Creates a sentence that only carries a <c>NullOrSymbols</c> value.
        /// </summary>
        /// <param name="nullOrSymbols">Value stored in the <c>nullOrSymbols</c> field.</param>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        public Sentence(NullOrSymbols nullOrSymbols, SentenceDiagnostics diagnostics)
        {
            this.diagnostics   = diagnostics;
            this.nullOrSymbols = nullOrSymbols;

            //Every sentence starts out with an empty list of la-fragments.
            this.LaFragment = new List<Fragment>();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Creates a degenerate sentence that consists of a single comment.
        /// </summary>
        /// <param name="comment">Comment stored in the <c>degenerateComment</c> field.</param>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        public Sentence(Comment comment, SentenceDiagnostics diagnostics)
        {
            this.diagnostics       = diagnostics;
            this.degenerateComment = comment;

            //Every sentence starts out with an empty list of la-fragments.
            this.LaFragment = new List<Fragment>();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates a compound sentence from optional precondition sentences and a conclusion.
        /// </summary>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        /// <param name="preconditions">Whole sentences acting as preconditions; may be null or empty.</param>
        /// <param name="conclusion">The head sentence; required when there is at least one precondition.</param>
        /// <exception cref="TpSyntaxException">
        /// Thrown when preconditions are supplied without a conclusion, when the conclusion has no
        /// punctuation, or when any precondition carries punctuation.
        /// </exception>
        public Sentence(SentenceDiagnostics diagnostics, Sentence[] preconditions = null, Sentence conclusion = null)
        {
            this.LaFragment = new List<Fragment>();

            bool hasPreconditions = preconditions != null && preconditions.Length > 0;
            if (hasPreconditions && conclusion == null)
            {
                throw new TpSyntaxException("There must be a head sentence (conclusions) if there are preconditions.");
            }

            this.preconditions = preconditions; //Entire sentences.
            this.conclusion    = conclusion;

            if (conclusion != null && conclusion.punctuation == null)
            {
                throw new TpSyntaxException("Conclusions require punctuation, if only through normalization");
            }

            //Link each precondition to its head, then check that it carries no punctuation of its own.
            int preconditionCount = preconditions == null ? 0 : preconditions.Length;
            for (int index = 0; index < preconditionCount; index++)
            {
                Sentence current = preconditions[index];
                current.HeadSentence = conclusion;

                if (current.punctuation != null)
                {
                    throw new TpSyntaxException("Preconditions should have no punctuation.");
                }
            }

            this.diagnostics = diagnostics;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Creates a degenerate sentence that consists of a single fragment plus punctuation.
        /// </summary>
        /// <param name="fragment">Fragment stored in the <c>degenerateFragment</c> field.</param>
        /// <param name="punctuation">Punctuation stored on the sentence.</param>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        public Sentence(Fragment fragment, Punctuation punctuation, SentenceDiagnostics diagnostics)
        {
            this.diagnostics        = diagnostics;
            this.punctuation        = punctuation;
            this.degenerateFragment = fragment;

            //Every sentence starts out with an empty list of la-fragments.
            this.LaFragment = new List<Fragment>();
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Creates a degenerate sentence that consists of a single vocative plus punctuation.
        /// </summary>
        /// <remarks>
        /// Design suggestion: vocatives should not chain, i.e.
        /// "o jan o meli o soweli o" => "o! jan o! meli o! soweli o!".
        /// </remarks>
        /// <param name="vocative">Vocative stored in the <c>degenerateVocative</c> field.</param>
        /// <param name="punctuation">Punctuation stored on the sentence.</param>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        public Sentence(Vocative vocative, Punctuation punctuation, SentenceDiagnostics diagnostics)
        {
            this.diagnostics        = diagnostics;
            this.punctuation        = punctuation;
            this.degenerateVocative = vocative;

            //Every sentence starts out with an empty list of la-fragments.
            this.LaFragment = new List<Fragment>();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Creates a degenerate sentence that consists of a single exclamation plus punctuation.
        /// </summary>
        /// <param name="exclamation">Exclamation stored in the <c>degenerateExclamation</c> field.</param>
        /// <param name="punctuation">Punctuation stored on the sentence.</param>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        public Sentence(Exclamation exclamation, Punctuation punctuation, SentenceDiagnostics diagnostics)
        {
            this.diagnostics           = diagnostics;
            this.punctuation           = punctuation;
            this.degenerateExclamation = exclamation;

            //Every sentence starts out with an empty list of la-fragments.
            this.LaFragment = new List<Fragment>();
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates a simple sentence from its subjects and predicates.
        /// </summary>
        /// <param name="subjects">Subject chain (only (*), o, en).</param>
        /// <param name="predicates">Predicate list (only li, pi, en).</param>
        /// <param name="diagnostics">Diagnostics object stored on the sentence.</param>
        /// <param name="parts">
        /// Optional extras (punctuation, conjunction, tag question, head vocatives, hortative flag).
        /// When null, the corresponding fields are not assigned by this constructor.
        /// </param>
        public Sentence(ComplexChain subjects, PredicateList predicates, SentenceDiagnostics diagnostics, SentenceOptionalParts parts = null)
        {
            this.LaFragment  = new List<Fragment>();
            this.subjects    = subjects;
            this.predicates  = predicates;
            this.diagnostics = diagnostics;

            if (parts == null)
            {
                //Nothing optional to copy.
                return;
            }

            this.punctuation    = parts.Punctuation;
            this.tagConjunction = parts.Conjunction;
            this.tagQuestion    = parts.TagQuestion;
            this.headVocatives  = parts.HeadVocatives;
            this.isHortative    = parts.IsHortative;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Parses one sentence into a <see cref="Sentence"/>. This should only operate on normalized sentences.
        /// </summary>
        /// <param name="sentence">The normalized sentence text to parse.</param>
        /// <param name="original">The text as it was before normalization; used for diagnostics and error messages.</param>
        /// <returns>
        /// One of: a NullOrSymbols sentence (blank input, or nothing left after quote marks are stripped),
        /// a comment sentence (input starts with "///"), a fragment sentence (the last la-part is a bare "la"),
        /// a simple sentence (no "la" split), the head sentence alone (la-parts but no precondition sentences),
        /// or a compound sentence built from the precondition sentences and the head sentence.
        /// </returns>
        /// <exception cref="TpParseException">
        /// Thrown when the sentence starts or ends with a space, ends with " li" / " li.", is headed by a
        /// fragment, or has no head sentence.
        /// </exception>
        public Sentence ParsedSentenceFactory(string sentence, string original)
        {
            //Note: this overwrites the instance-level diagnostics on every call.
            diagnostics = new SentenceDiagnostics(original, sentence);

            if (String.IsNullOrWhiteSpace(sentence))
            {
                return(new Sentence(new NullOrSymbols(original), diagnostics));
                //  throw new TpParseException("Do not give me a null sentence. Can't tell if null sentence is from input or got lost in translation");
            }

            //This may have already been done by the normalizer, but if not, no problem.
            if (sentence.Contains(" li pi "))
            {
                sentence = sentence.Replace(" li pi ", " li XXXXZiXXXX ");
            }
            ParserUtils.ThrowOnDoubleParticles(sentence, dialect);



            if (sentence.StartCheck(" "))
            {
                throw new TpParseException("Do not give me a sentence that leads with whitespace, I do not want to do defensive Trim() all day. (Call at least NormalizeExplict)");
            }

            if (sentence.StartCheck("///"))
            {
                Comment c = new Comment(sentence);
                return(new Sentence(c, diagnostics));
            }


            if (sentence.EndCheck(" li") || sentence.EndCheck(" li."))
            {
                throw new TpParseException("Something went wrong, sentence ends with li: " + original);
            }
            //Normalization is really expensive. We must stop calling it twice.
            //sentence = Normalizer.NormalizeText(sentence, config); //Any way to avoid calling this twice?

            //HACK: This is necessary (otherwise we have to deal with optional quotes starting, ending words)
            //But we'd rather do this on a sentence level in Discourse.
            //NOTE(review): these two flags are set but never read yet; see the TODO below.
            bool startsQuotedSpeech;
            bool endsQuotedSpeech;

            if (sentence.StartCheck("«"))
            {
                startsQuotedSpeech = true;
                //Replace removes every "«" in the sentence, not only the leading one.
                sentence           = sentence.Replace("«", " ").Trim();
            }
            if (sentence.EndCheck("»", "».", "»!") || sentence.EndCheck("»:", "»?"))
            {
                endsQuotedSpeech = true;
                sentence         = sentence.Replace("»", " ").Trim();
            }

            //TODO: do something with quoted speech. Big problem #1 it spans multiple sentences

            //Stripping the quote marks can leave nothing behind (e.g. the input was just "«").
            //Handle that like blank input instead of indexing into an empty string further down.
            if (sentence.Length == 0)
            {
                return(new Sentence(new NullOrSymbols(original), diagnostics));
            }

            if (sentence.EndCheck(" "))
            {
                throw new TpParseException("Normalizer failed to trim: " + original);
            }

            //Get the final punctuation out or it will mess up parsing later.
            string      possiblePunctuation = sentence[sentence.Length - 1].ToString();
            Punctuation punctuation;

            if (Punctuation.TryParse(possiblePunctuation, out punctuation))
            {
                sentence = sentence.Substring(0, sentence.Length - 1);
            }


            //Square bracket sentence contains all others
            //[S]
            //F la [S]
            //S la [S]
            //F la S la [S]
            //Maximal.. maybe later
            //F la S la F la S  => (F la S ) la (F la [S])
            //F la S la S la F la S la S
            //[{F la S} la {S} la {F la S}] la <S>

            //Just dealing with la fragments

            Sentence        headSentence  = null;
            List <Sentence> preconditions = new List <Sentence>();

            string[] laParts = Splitters.SplitOnLa(sentence);

            //Degenerate sentences.
            if (laParts[laParts.Length - 1] == "la")
            {
                //The text ends in a bare "la": return a fragment-only sentence.
                //Only the first part is used to build the fragment.
                Fragment fragment         = new Fragment(ProcessEnPiChain(laParts[0]));
                Sentence fragmentSentence = new Sentence(fragment, punctuation, diagnostics);
                return(fragmentSentence);
            }

            if (laParts.Length > 1)
            {
                int             i               = 0;
                //NOTE(review): fragments that follow a precondition sentence are collected here,
                //but this list is never read afterwards, so they do not reach the result.
                List <Fragment> laFragments     = new List <Fragment>();
                Sentence        currentSentence = null;
                //Walk the parts right to left: the last part is the head sentence.
                foreach (string subSentence in laParts.Reverse())
                {
                    i++;
                    if (i == 1)
                    {
                        //Head sentence.
                        // subSentence.StartCheck("la ") ? subSentence.Substring(3) : subSentence
                        string laLessString = subSentence.RemoveLeadingWholeWord("la");
                        headSentence = ProcessSimpleSentence(laLessString, punctuation, original);
                        continue; //Not dealing with "kin la!"
                    }

                    //Fragments & preconditions
                    //A part containing the whole word "li" is treated as a sentence, anything else as a fragment.
                    const string liFinder = @"\bli\b";
                    Match        m        = Regex.Match(subSentence, liFinder);
                    if (m.Success)
                    {
                        //This is a sentence
                        //Maybe should recurse.
                        string laLessString = subSentence.RemoveLeadingWholeWord("la");

                        //Preconditions are parsed without punctuation.
                        currentSentence = ProcessSimpleSentence(laLessString, null, original);
                        preconditions.Add(currentSentence);
                    }
                    else
                    {
                        string   laLessString = subSentence.RemoveLeadingWholeWord("la");
                        Fragment fragment;
                        if (laLessString.StartCheck("~"))
                        {
                            //A leading ~ means the fragment is parsed as prepositional phrases.
                            string[] parts = Splitters.SplitOnPrepositions(laLessString);
                            fragment = new Fragment(ProcessPrepositionalPhrases(parts).ToArray());
                        }
                        else
                        {
                            fragment = new Fragment(ProcessEnPiChain(laLessString));
                        }

                        if (currentSentence == null)
                        {
                            //No precondition sentence seen yet: the fragment belongs to the head sentence.
                            if (headSentence == null)
                            {
                                throw new TpParseException(
                                          "Sentence appears to be headed by a fragment. Shouldn't deal with those here.: " + original);
                            }
                            headSentence.LaFragment.Add(fragment);
                        }
                        else
                        {
                            laFragments.Add(fragment);
                        }
                    }
                }
            }
            else
            {
                //No la at all.
                //Simple Sentence
                return(ProcessSimpleSentence(sentence, punctuation, original));
            }
            if (headSentence == null)
            {
                throw new TpParseException("This is not a sentence, should deal with it with it's own parser: " + original);
            }
            if (preconditions.Count == 0)
            {
                return(headSentence);
            }
            Sentence s = new Sentence(diagnostics, preconditions.ToArray(), headSentence);

            return(s);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Runs the text through an ordered series of normalization passes and returns the result.
        /// </summary>
        /// <remarks>
        /// When <c>dialect.InferCompoundsPrepositionsForeignText</c> is false, the work is delegated to a
        /// <c>NormalizeExplicit</c> instance. The passes below depend on their order; do not reorder them
        /// without re-testing.
        /// </remarks>
        /// <param name="text">The raw text to normalize.</param>
        /// <returns>
        /// The normalized text; an empty string when the input is blank or only punctuation; the input
        /// unchanged when it is a single-line "///" comment.
        /// </returns>
        /// <exception cref="NormalizationException">
        /// Thrown in DEBUG builds when "[NULL]" appears in the text, and in all builds when the result
        /// starts with "« »".
        /// </exception>
        public string NormalizeText(string text) //= null
        {
            if (!dialect.InferCompoundsPrepositionsForeignText)
            {
                //HACK: Not the way this should work.
                NormalizeExplicit ex = new NormalizeExplicit(dialect);
                return(ex.NormalizeText(text));
            }
            //NOTE(review): sd is assigned here and again at the end but never read in this method.
            SentenceDiagnostics sd = new SentenceDiagnostics(text, "N/A");

            //Nothing to parse.
            if (string.IsNullOrWhiteSpace(text) || NormalizationTasks.IsNullWhiteOrPunctuation(text))
            {
                return("");
            }

            //Don't normalize a (single line) comment.
            if (text.StartCheck("///") && !text.Contains("\n"))
            {
                return(text);
            }

            string normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(text);

            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);

            //Collapse a doubled double-quote into a single one.
            //Is this better early or later?
            if (normalized.Contains(@""""""))
            {
                normalized = normalized.Replace(@"""""", @"""");
            }

            //Hide tokens that otherwise have a different meaning.
            if (normalized.ContainsCheck(" li pi "))
            {
                normalized = normalized.Replace(" li pi ", " li XXXXZiXXXX ");
            }


            //  "/\\*.*?\\*/"
            // Things that cross sentences should already be dealt with earlier.
            if (normalized.ContainsCheck("/*") && normalized.ContainsCheck("*/"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Comments", NormalizationTasks.StripMultilineComments);
            }

            //Process explicit foreign text, i.e. text in double quotes. (this always happens)
            if (normalized.ContainsCheck("\""))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ForeignSpace", NormalizationTasks.ProcessWhiteSpaceInForeignText, dialect);
            }

            //Process implicit foreign text.
            //NOTE(review): given the early return at the top, this flag looks like it is always true here
            //(assuming it does not change during the call), so the check appears redundant.
            if (dialect.InferCompoundsPrepositionsForeignText)
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Foreign", NormalizeForeignText.NormalizeImplicit, dialect);
            }

            //Hyphenated words. This could cause a problem for compound words that cross lines.
            if (normalized.ContainsCheck("-\n"))
            {
                normalized = normalized.Replace("-\n", "");
            }

            //can't cope with line breaks.
            if (normalized.ContainsCheck("\n"))
            {
                normalized = normalized.Replace("\n", " ");
            }
            if (normalized.ContainsCheck("\t"))
            {
                normalized = normalized.Replace("\t", " ");
            }

            //must be after - processing
            if (dialect.InferNumbers)
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Numbers", NormalizeNumbers.FindNumbers, dialect);
            }



            //Extraneous punctuation-- TODO, expand to most other symbols.
            if (normalized.ContainsCheck("(") || normalized.ContainsCheck(")"))
            {
                normalized = normalized.Replace("(", "");
                normalized = normalized.Replace(")", "");
            }

            //Extraneous commas
            if (normalized.ContainsCheck(","))
            {
                //Benefit of the doubt. if you see , sama, ==> ~sama
                //Otherwise, assume it is garbage.
                foreach (string prep in Particles.Prepositions)
                {
                    //",prep" becomes "~prep" with no space added; ", prep" becomes " ~prep".
                    if (normalized.ContainsCheck("," + prep))
                    {
                        normalized = normalized.Replace("," + prep, "~" + prep);
                    }
                    if (normalized.ContainsCheck(", " + prep))
                    {
                        normalized = normalized.Replace(", " + prep, " ~" + prep);
                    }
                }


                normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraCommas", NormalizationTasks.ProcessExtraneousCommas);
            }

            //Left overs from initial parsing.
            //DEBUG builds fail loudly; other builds silently strip the marker.
            if (normalized.ContainsCheck("[NULL]"))
            {
#if DEBUG
                throw new NormalizationException("Stop adding [NULL] to normalized sentences.");
#else
                normalized = normalized.Replace("[NULL]", "");
#endif
            }
            //Normalize prepositions to ~, so that we don't have tokens with embedded spaces (e.g. foo, kepeken => [foo],[, kepeken])

            if (normalized.ContainsCheck(" "))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraWhiteSpace", NormalizationTasks.ProcessExtraneousWhiteSpace);
            }



            //Okay, phrases should be recognizable now.
            if (dialect.InferCompoundsPrepositionsForeignText)
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Compounds", cw.ProcessCompoundWords);
            }


            //Note: this pass receives both the untouched input and the current working text.
            if (dialect.InferCompoundsPrepositionsForeignText)
            {
                normalized = NormalizationTasks.MarkImplicitPrepositions(text, normalized);
            }

            //la o
            //invisible implicit subject.
            if (normalized.ContainsCheck(" la o "))
            {
                normalized = normalized.Replace(" la o ", " la jan Sanwan o ");
            }

            normalized = NormalizeMiSina.MiSinaProcessAndUndoOverNormalization(normalized);

            if (normalized.ContainsCheck("~"))
            {
                normalized = NormalizationTasks.ThoseArentPrepositions(normalized);
            }

            normalized = Regex.Replace(normalized, @"^\s+|\s+$", ""); //Remove leading and trailing whitespace


            //If it is a sentence fragment, I really can't deal with prep phrase that may or may not be in it.
            //So every ~ marker is dropped unless the text contains " li " or starts with "o ".
            if (normalized.ContainsCheck("~") &&
                !normalized.ContainsCheck(" li ") && //full sentence okay
                !normalized.StartCheck("o ")    //imperative okay
                )
            {
                normalized = normalized.Replace("~", ""); //HACK: This may erase ~ added by user at the start?
            }

            normalized = NormalizeMiSina.ProcessMiSinaOvernormalizationWithPrepositions(normalized);


            normalized = NormalizeMiSina.ProcessMiSinaOverNormalizationWithoutPrepositions(text, normalized);

            //One off that comes back?
            //For these exact phrases the ~ marker is removed again.
            foreach (string oneOff in new[] {
                "li ~lon poka e",                                          //place something next to
                "li ~tawa tu e"
            })
            {
                normalized = normalized.Replace(oneOff, oneOff.Replace("~", ""));
            }


            if (normalized.ContainsCheck("'"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "DirectQuotes", NormalizationTasks.AddDirectedQuotes);
            }

            //Post conditions.
            if (normalized.StartCheck("« »"))
            {
                throw new NormalizationException("quote recognition went wrong: " + text);
            }


            //Final clean-up of whitespace that was probably added above by mistake.
            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);
            normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(normalized);

            sd = new SentenceDiagnostics(text, normalized);
            return(normalized);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Runs the text through an ordered series of normalization passes and returns the result.
        /// This variant does not infer numbers, compound words or implicit prepositions.
        /// </summary>
        /// <remarks>
        /// The passes below depend on their order; do not reorder them without re-testing.
        /// </remarks>
        /// <param name="text">The raw text to normalize.</param>
        /// <returns>
        /// The normalized text; an empty string when the input is blank or only punctuation; the input
        /// unchanged when it is a single-line "///" comment.
        /// </returns>
        /// <exception cref="NormalizationException">Thrown when the result starts with "« »".</exception>
        public string NormalizeText(string text) //= null
        {
            //NOTE(review): sd is assigned here and again at the end but never read in this method.
            SentenceDiagnostics sd = new SentenceDiagnostics(text, "N/A");

            //Nothing to parse.
            if (string.IsNullOrWhiteSpace(text) || NormalizationTasks.IsNullWhiteOrPunctuation(text))
            {
                return("");
            }
            //Don't normalize a (single line) comment.
            if (text.StartCheck("///") && !text.Contains("\n"))
            {
                return(text);
            }

            string normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(text);

            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);

            //Collapse a doubled double-quote into a single one.
            //Is this better early or later?
            if (normalized.Contains(@""""""))
            {
                normalized = normalized.Replace(@"""""", @"""");
            }

            //Hide tokens that otherwise have a different meaning.
            if (normalized.ContainsCheck(" li pi "))
            {
                normalized = normalized.Replace(" li pi ", " li XXXXZiXXXX ");
            }


            //  "/\\*.*?\\*/"
            // Things that cross sentences should already be dealt with earlier.
            if (normalized.ContainsCheck("/*") && normalized.ContainsCheck("*/"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "Comments", NormalizationTasks.StripMultilineComments);
            }

            //Process explicit foreign text, i.e. text in double quotes. (this always happens)
            if (normalized.ContainsCheck("\""))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ForeignSpace", NormalizationTasks.ProcessWhiteSpaceInForeignText, dialect);
            }

            //Swap terminators (always happens)
            normalized = NormalizationTasks.ApplyNormalization(normalized, "Foreign", NormalizeForeignText.NormalizeExplicit, dialect);


            //Hyphenated words. This could cause a problem for compound words that cross lines.
            if (normalized.ContainsCheck("-\n"))
            {
                normalized = normalized.Replace("-\n", "");
            }

            //can't cope with line breaks.
            if (normalized.ContainsCheck("\n"))
            {
                normalized = normalized.Replace("\n", " ");
            }
            if (normalized.ContainsCheck("\t"))
            {
                normalized = normalized.Replace("\t", " ");
            }

            //must be after - processing
            //Don't infer numbers.

            //Extraneous punctuation-- TODO, expand to most other symbols.
            if (normalized.ContainsCheck("(") || normalized.ContainsCheck(")"))
            {
                normalized = normalized.Replace("(", "");
                normalized = normalized.Replace(")", "");
            }

            //Extraneous commas... not sure, we'd like some to go away, but we want ,lon ,sama etc to stay.
            //if (normalized.ContainsCheck(","))
            //{
            //    normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraCommas", NormalizationTasks.ProcessExtraneousCommas);
            //}

            //Normalize prepositions to ~, so that we don't have tokens with embedded spaces (e.g. foo, kepeken => [foo],[, kepeken])

            if (normalized.ContainsCheck(","))
            {
                foreach (string prep in Particles.Prepositions)
                {
                    //Both ",prep" and ", prep" become " ~prep"; other commas are left in place.
                    if (normalized.ContainsCheck("," + prep))
                    {
                        normalized = normalized.Replace("," + prep, " ~" + prep);
                    }
                    if (normalized.ContainsCheck(", " + prep))
                    {
                        normalized = normalized.Replace(", " + prep, " ~" + prep);
                    }
                }
            }

            if (normalized.ContainsCheck(" "))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraWhiteSpace", NormalizationTasks.ProcessExtraneousWhiteSpace);
            }



            //Okay, phrases should be recognizable now.
            //Don't infer compound words

            //if (dialect.InferCompoundsPrepositionsForeignText)
            //{
            //    normalized = NormalizationTasks.MarkImplicitPrepositions(text, normalized);
            //}

            //la o
            //invisible implicit subject.
            if (normalized.ContainsCheck(" la o "))
            {
                normalized = normalized.Replace(" la o ", " la jan Sanwan o ");
            }

            normalized = NormalizeMiSina.MiSinaProcessAndUndoOverNormalization(normalized);

            normalized = Regex.Replace(normalized, @"^\s+|\s+$", ""); //Remove leading and trailing whitespace

            normalized = NormalizeMiSina.ProcessMiSinaOvernormalizationWithPrepositions(normalized);

            normalized = NormalizeMiSina.ProcessMiSinaOverNormalizationWithoutPrepositions(text, normalized);

            if (normalized.ContainsCheck("'"))
            {
                normalized = NormalizationTasks.ApplyNormalization(normalized, "DirectQuotes", NormalizationTasks.AddDirectedQuotes);
            }

            //Final clean-up of whitespace that was probably added above by mistake.
            normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);
            normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(normalized);

            //Post conditions.
            if (normalized.StartCheck("« »"))
            {
                throw new NormalizationException("quote recognition went wrong: " + text);
            }

            sd = new SentenceDiagnostics(text, normalized);
            return(normalized);
        }