/// <summary>
/// Creates a sentence that only carries a <see cref="NullOrSymbols"/> value.
/// </summary>
/// <param name="nullOrSymbols">Value stored in the <c>nullOrSymbols</c> field.</param>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
public Sentence(NullOrSymbols nullOrSymbols, SentenceDiagnostics diagnostics)
{
    this.diagnostics = diagnostics;
    this.nullOrSymbols = nullOrSymbols;

    // Every sentence starts with an empty list of la-fragments.
    LaFragment = new List<Fragment>();
}
/// <summary>
/// Creates a sentence that only carries a <see cref="Comment"/>.
/// </summary>
/// <param name="comment">Comment stored in the <c>degenerateComment</c> field.</param>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
public Sentence(Comment comment, SentenceDiagnostics diagnostics)
{
    this.diagnostics = diagnostics;
    degenerateComment = comment;

    // Every sentence starts with an empty list of la-fragments.
    LaFragment = new List<Fragment>();
}
/// <summary>
/// Creates a compound sentence from an optional conclusion (head sentence) and
/// optional precondition sentences.
/// </summary>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
/// <param name="preconditions">
/// Entire sentences acting as preconditions. Each one must have no punctuation; on success
/// each one has its <c>HeadSentence</c> set to <paramref name="conclusion"/>.
/// </param>
/// <param name="conclusion">The head sentence. Required when there is at least one precondition.</param>
/// <exception cref="TpSyntaxException">
/// Thrown when preconditions are given without a conclusion, when the conclusion has no
/// punctuation, or when a precondition is null or carries punctuation.
/// </exception>
public Sentence(SentenceDiagnostics diagnostics, Sentence[] preconditions = null, Sentence conclusion = null)
{
    LaFragment = new List<Fragment>();

    if (preconditions != null && preconditions.Length > 0 && conclusion == null)
    {
        throw new TpSyntaxException("There must be a head sentence (conclusions) if there are preconditions.");
    }

    if (conclusion != null && conclusion.punctuation == null)
    {
        throw new TpSyntaxException("Conclusions require punctuation, if only through normalization");
    }

    if (preconditions != null)
    {
        // Validate every precondition BEFORE touching any of them, so that a failure
        // does not leave the caller's sentences partially re-linked.
        foreach (Sentence precondition in preconditions)
        {
            if (precondition == null)
            {
                throw new TpSyntaxException("Preconditions must not contain null sentences.");
            }

            if (precondition.punctuation != null)
            {
                throw new TpSyntaxException("Preconditions should have no punctuation.");
            }
        }

        foreach (Sentence precondition in preconditions)
        {
            precondition.HeadSentence = conclusion;
        }
    }

    this.conclusion = conclusion;
    this.preconditions = preconditions; //Entire sentences.
    this.diagnostics = diagnostics;
}
/// <summary>
/// Creates a sentence that consists of a single <see cref="Fragment"/> plus punctuation.
/// </summary>
/// <param name="fragment">Fragment stored in the <c>degenerateFragment</c> field.</param>
/// <param name="punctuation">Punctuation stored with this sentence.</param>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
public Sentence(Fragment fragment, Punctuation punctuation, SentenceDiagnostics diagnostics)
{
    this.diagnostics = diagnostics;
    this.punctuation = punctuation;
    degenerateFragment = fragment;

    // Every sentence starts with an empty list of la-fragments.
    LaFragment = new List<Fragment>();
}
/// <summary>
/// Creates a sentence that consists of a single <see cref="Vocative"/> plus punctuation.
/// </summary>
/// <remarks>
/// Suggestion: vocatives don't chain.
/// o jan o meli o soweli o => o! jan o! meli o! soweli o!
/// </remarks>
/// <param name="vocative">Vocative stored in the <c>degenerateVocative</c> field.</param>
/// <param name="punctuation">Punctuation stored with this sentence.</param>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
public Sentence(Vocative vocative, Punctuation punctuation, SentenceDiagnostics diagnostics)
{
    this.diagnostics = diagnostics;
    this.punctuation = punctuation;
    degenerateVocative = vocative;

    // Every sentence starts with an empty list of la-fragments.
    LaFragment = new List<Fragment>();
}
/// <summary>
/// Creates a sentence that consists of a single <see cref="Exclamation"/> plus punctuation.
/// </summary>
/// <param name="exclamation">Exclamation stored in the <c>degenerateExclamation</c> field.</param>
/// <param name="punctuation">Punctuation stored with this sentence.</param>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
public Sentence(Exclamation exclamation, Punctuation punctuation, SentenceDiagnostics diagnostics)
{
    this.diagnostics = diagnostics;
    this.punctuation = punctuation;
    degenerateExclamation = exclamation;

    // Every sentence starts with an empty list of la-fragments.
    LaFragment = new List<Fragment>();
}
//Simple Sentences
/// <summary>
/// Creates a simple sentence from its subjects and predicates, optionally decorated with
/// the extra parts carried by <paramref name="parts"/>.
/// </summary>
/// <param name="subjects">Subject chain (only (*), o, en).</param>
/// <param name="predicates">Predicate list (only li, pi, en).</param>
/// <param name="diagnostics">Diagnostics stored with this sentence.</param>
/// <param name="parts">
/// Optional punctuation, conjunction, tag question, head vocatives and hortative flag.
/// When null, none of those fields are assigned here.
/// </param>
public Sentence(ComplexChain subjects, PredicateList predicates, SentenceDiagnostics diagnostics, SentenceOptionalParts parts = null)
{
    LaFragment = new List<Fragment>();
    this.subjects = subjects;     //only (*), o, en
    this.predicates = predicates; //only li, pi, en
    this.diagnostics = diagnostics;

    if (parts == null)
    {
        return;
    }

    // Copy the optional decorations in the same order the properties are declared on use.
    punctuation = parts.Punctuation;
    tagConjunction = parts.Conjunction;
    tagQuestion = parts.TagQuestion;
    headVocatives = parts.HeadVocatives;
    isHortative = parts.IsHortative;
}
//This should only operate on normalized sentences.
/// <summary>
/// Parses one already-normalized sentence into a <see cref="Sentence"/>.
/// </summary>
/// <remarks>
/// Stores a fresh <see cref="SentenceDiagnostics"/> in the <c>diagnostics</c> member
/// (declared outside this method) and passes it to every sentence constructed here.
/// </remarks>
/// <param name="sentence">The normalized sentence text. Must not start with whitespace.</param>
/// <param name="original">The text before normalization; used for diagnostics and error messages.</param>
/// <returns>
/// A <see cref="NullOrSymbols"/> sentence for blank input (or input that is blank once
/// quote marks are stripped), a comment sentence for text starting with "///",
/// a fragment sentence when the last la-part is a bare "la", the simple sentence when there
/// is no la split, or a compound sentence made of the head sentence and its preconditions.
/// </returns>
/// <exception cref="TpParseException">
/// Thrown for leading whitespace, a sentence ending in "li", untrimmed trailing whitespace,
/// or when no head sentence could be produced.
/// </exception>
public Sentence ParsedSentenceFactory(string sentence, string original)
{
    diagnostics = new SentenceDiagnostics(original, sentence);

    if (String.IsNullOrWhiteSpace(sentence))
    {
        // Can't tell if a null sentence is from input or got lost in translation,
        // so report it as a NullOrSymbols sentence instead of throwing.
        return new Sentence(new NullOrSymbols(original), diagnostics);
    }

    //This may have already been done by the normalizer, but if not, no problem.
    if (sentence.Contains(" li pi "))
    {
        sentence = sentence.Replace(" li pi ", " li XXXXZiXXXX ");
    }

    ParserUtils.ThrowOnDoubleParticles(sentence, dialect);

    if (sentence.StartCheck(" "))
    {
        throw new TpParseException("Do not give me a sentence that leads with whitespace, I do not want to do defensive Trim() all day. (Call at least NormalizeExplict)");
    }

    if (sentence.StartCheck("///"))
    {
        Comment comment = new Comment(sentence);
        return new Sentence(comment, diagnostics);
    }

    if (sentence.EndCheck(" li") || sentence.EndCheck(" li."))
    {
        throw new TpParseException("Something went wrong, sentence ends with li: " + original);
    }

    //Normalization is really expensive. We must stop calling it twice.

    //HACK: This is necessary (otherwise we have to deal with optional quotes starting, ending words)
    //But we'd rather do this on a sentence level in Discourse.
    //TODO: do something with quoted speech. Big problem #1 it spans multiple sentences
    if (sentence.StartCheck("«"))
    {
        sentence = sentence.Replace("«", " ").Trim();
    }

    if (sentence.EndCheck("»", "».", "»!") || sentence.EndCheck("»:", "»?"))
    {
        sentence = sentence.Replace("»", " ").Trim();
    }

    // Stripping the quote marks can leave nothing behind (e.g. a lone "«").
    // Treat that like blank input rather than indexing into an empty string below.
    if (sentence.Length == 0)
    {
        return new Sentence(new NullOrSymbols(original), diagnostics);
    }

    if (sentence.EndCheck(" "))
    {
        throw new TpParseException("Normalizer failed to trim: " + original);
    }

    //Get the final punctuation out or it will mess up parsing later.
    string possiblePunctuation = sentence[sentence.Length - 1].ToString();
    Punctuation punctuation;
    if (Punctuation.TryParse(possiblePunctuation, out punctuation))
    {
        sentence = sentence.Substring(0, sentence.Length - 1);
    }

    //Square bracket sentence contains all others
    //[S]
    //F la [S]
    //S la [S]
    //F la S la [S]
    //Maximal.. maybe later
    //F la S la F la S => (F la S ) la (F la [S])
    //F la S la S la F la S la S
    //[{F la S} la {S} la {F la S}] la <S>

    //Just dealing with la fragments
    string[] laParts = Splitters.SplitOnLa(sentence);

    //Degenerate sentences.
    if (laParts[laParts.Length - 1] == "la")
    {
        // The text ends in a bare "la": build a fragment-only sentence from the first part.
        Fragment loneFragment = new Fragment(ProcessEnPiChain(laParts[0]));
        return new Sentence(loneFragment, punctuation, diagnostics);
    }

    if (laParts.Length <= 1)
    {
        //No la at all.
        //Simple Sentence
        return ProcessSimpleSentence(sentence, punctuation, original);
    }

    Sentence headSentence = null;
    Sentence currentSentence = null; // most recently parsed precondition sentence
    List<Sentence> preconditions = new List<Sentence>();
    bool isHead = true;

    // Walk right-to-left: the last la-part is the head sentence, everything before it
    // is either a precondition sentence or a fragment.
    foreach (string subSentence in laParts.Reverse())
    {
        string laLessString = subSentence.RemoveLeadingWholeWord("la");

        if (isHead)
        {
            //Head sentence. Only it receives the final punctuation.
            isHead = false;
            headSentence = ProcessSimpleSentence(laLessString, punctuation, original);
            continue; //Not dealing with "kin la!"
        }

        //Fragments & preconditions
        if (Regex.IsMatch(subSentence, @"\bli\b"))
        {
            //This is a sentence
            //Maybe should recurse.
            currentSentence = ProcessSimpleSentence(laLessString, null, original);
            preconditions.Add(currentSentence);
            continue;
        }

        Fragment fragment;
        if (laLessString.StartCheck("~"))
        {
            string[] parts = Splitters.SplitOnPrepositions(laLessString);
            fragment = new Fragment(ProcessPrepositionalPhrases(parts).ToArray());
        }
        else
        {
            fragment = new Fragment(ProcessEnPiChain(laLessString));
        }

        if (currentSentence == null)
        {
            if (headSentence == null)
            {
                throw new TpParseException(
                    "Sentence appears to be headed by a fragment. Shouldn't deal with those here.: " + original);
            }
            headSentence.LaFragment.Add(fragment);
        }
        else
        {
            // The fragment sits in front of the precondition parsed just before it in this
            // reversed walk ("F la S la [S]"), so it belongs to that precondition.
            // Previously it was collected into a list that was never read, and lost.
            currentSentence.LaFragment.Add(fragment);
        }
    }

    if (headSentence == null)
    {
        throw new TpParseException("This is not a sentence, should deal with it with it's own parser: " + original);
    }

    if (preconditions.Count == 0)
    {
        return headSentence;
    }

    return new Sentence(diagnostics, preconditions.ToArray(), headSentence);
}
/// <summary>
/// Normalizes raw text with inference enabled (foreign text, numbers, compound words and
/// implicit prepositions), producing the form the parser expects.
/// </summary>
/// <remarks>
/// When <c>dialect.InferCompoundsPrepositionsForeignText</c> is false the work is delegated
/// to <see cref="NormalizeExplicit"/>. The steps below are order-dependent.
/// </remarks>
/// <param name="text">Raw input text.</param>
/// <returns>
/// The normalized text; an empty string when the input is null, whitespace or punctuation
/// only; the input unchanged when it is a single-line "///" comment.
/// </returns>
/// <exception cref="NormalizationException">
/// Thrown when the result starts with an empty quote pair, or (DEBUG builds only) when
/// "[NULL]" shows up in the text.
/// </exception>
public string NormalizeText(string text) //= null
{
    if (!dialect.InferCompoundsPrepositionsForeignText)
    {
        //HACK: Not the way this should work.
        NormalizeExplicit ex = new NormalizeExplicit(dialect);
        return ex.NormalizeText(text);
    }

    //Nothing to parse.
    if (string.IsNullOrWhiteSpace(text) || NormalizationTasks.IsNullWhiteOrPunctuation(text))
    {
        return "";
    }

    //Don't normalize a comment.
    if (text.StartCheck("///") && !text.Contains("\n"))
    {
        return text;
    }

    string normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(text);
    normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);

    //Is this better early or later?
    if (normalized.Contains(@""""""))
    {
        normalized = normalized.Replace(@"""""", @"""");
    }

    //Hide tokens that otherwise have a different meaning.
    if (normalized.ContainsCheck(" li pi "))
    {
        normalized = normalized.Replace(" li pi ", " li XXXXZiXXXX ");
    }

    // "/\\*.*?\\*/"
    // Things that cross sentences should already be dealt with earlier.
    if (normalized.ContainsCheck("/*") && normalized.ContainsCheck("*/"))
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "Comments", NormalizationTasks.StripMultilineComments);
    }

    //Process explicit Foreign text. (this always happens)
    if (normalized.ContainsCheck("\""))
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "ForeignSpace", NormalizationTasks.ProcessWhiteSpaceInForeignText, dialect);
    }

    //Process implicit Foreign Text
    if (dialect.InferCompoundsPrepositionsForeignText)
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "Foreign", NormalizeForeignText.NormalizeImplicit, dialect);
    }

    //Hyphenated words. This could cause a problem for compound words that cross lines.
    if (normalized.ContainsCheck("-\n"))
    {
        normalized = normalized.Replace("-\n", "");
    }

    //can't cope with line breaks.
    if (normalized.ContainsCheck("\n"))
    {
        normalized = normalized.Replace("\n", " ");
    }
    if (normalized.ContainsCheck("\t"))
    {
        normalized = normalized.Replace("\t", " ");
    }

    //must be after - processing
    if (dialect.InferNumbers)
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "Numbers", NormalizeNumbers.FindNumbers, dialect);
    }

    //Extraneous punctuation-- TODO, expand to most other symbols.
    if (normalized.ContainsCheck("(") || normalized.ContainsCheck(")"))
    {
        normalized = normalized.Replace("(", "");
        normalized = normalized.Replace(")", "");
    }

    //Extraneous commas
    if (normalized.ContainsCheck(","))
    {
        //Benefit of the doubt. if you see , sama, ==> ~sama
        //Otherwise, assume it is garbage.
        foreach (string prep in Particles.Prepositions)
        {
            if (normalized.ContainsCheck("," + prep))
            {
                // Keep a space in front of the marker so "foo,sama" does not become the
                // single token "foo~sama" (same replacement the explicit normalizer uses).
                normalized = normalized.Replace("," + prep, " ~" + prep);
            }
            if (normalized.ContainsCheck(", " + prep))
            {
                normalized = normalized.Replace(", " + prep, " ~" + prep);
            }
        }
        normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraCommas", NormalizationTasks.ProcessExtraneousCommas);
    }

    //Left overs from initial parsing.
    if (normalized.ContainsCheck("[NULL]"))
    {
#if DEBUG
        throw new NormalizationException("Stop adding [NULL] to normalized sentences.");
#else
        normalized = normalized.Replace("[NULL]", "");
#endif
    }

    //Normalize prepositions to ~, so that we don't have tokens with embedded spaces (e.g. foo, kepeken => [foo],[, kepeken])
    if (normalized.ContainsCheck(" "))
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "ExtraWhiteSpace", NormalizationTasks.ProcessExtraneousWhiteSpace);
    }

    //Okay, phrases should be recognizable now.
    if (dialect.InferCompoundsPrepositionsForeignText)
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "Compounds", cw.ProcessCompoundWords);
    }
    if (dialect.InferCompoundsPrepositionsForeignText)
    {
        normalized = NormalizationTasks.MarkImplicitPrepositions(text, normalized);
    }

    //la o
    //invisible implicit subject.
    if (normalized.ContainsCheck(" la o "))
    {
        normalized = normalized.Replace(" la o ", " la jan Sanwan o ");
    }

    normalized = NormalizeMiSina.MiSinaProcessAndUndoOverNormalization(normalized);

    if (normalized.ContainsCheck("~"))
    {
        normalized = NormalizationTasks.ThoseArentPrepositions(normalized);
    }

    normalized = Regex.Replace(normalized, @"^\s+|\s+$", ""); //Remove extraneous whitespace

    //If it is a sentence fragment, I really can't deal with prep phrase that may or may not be in it.
    if (normalized.ContainsCheck("~")
        && !normalized.ContainsCheck(" li ") //full sentence okay
        && !normalized.StartCheck("o ")      //imperative okay
        )
    {
        normalized = normalized.Replace("~", ""); //HACK: This may erase ~ added by user at the start?
    }

    normalized = NormalizeMiSina.ProcessMiSinaOvernormalizationWithPrepositions(normalized);
    normalized = NormalizeMiSina.ProcessMiSinaOverNormalizationWithoutPrepositions(text, normalized);

    //One off that comes back?
    foreach (string oneOff in new[]
    {
        "li ~lon poka e", //place something next to
        "li ~tawa tu e"
    })
    {
        normalized = normalized.Replace(oneOff, oneOff.Replace("~", ""));
    }

    if (normalized.ContainsCheck("'"))
    {
        normalized = NormalizationTasks.ApplyNormalization(normalized, "DirectQuotes", NormalizationTasks.AddDirectedQuotes);
    }

    //Post conditions.
    if (normalized.StartCheck("« »"))
    {
        throw new NormalizationException("quote recognition went wrong: " + text);
    }

    //Probably added above by mistake
    normalized = NormalizationTasks.RemoveInternalWhiteSpace(normalized);
    normalized = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(normalized);

    return normalized;
}
/// <summary>
/// Normalizes raw text without inferring numbers, compound words or implicit prepositions.
/// </summary>
/// <remarks>
/// The steps are order-dependent: whitespace clean-up, token hiding, comment stripping,
/// foreign-text handling, line-break removal, comma-marked prepositions, mi/sina fix-ups,
/// directed quotes, and a final whitespace pass followed by a post-condition check.
/// </remarks>
/// <param name="text">Raw input text.</param>
/// <returns>
/// The normalized text; an empty string when the input is null, whitespace or punctuation
/// only; the input unchanged when it is a single-line "///" comment.
/// </returns>
/// <exception cref="NormalizationException">
/// Thrown when the final result starts with an empty quote pair.
/// </exception>
public string NormalizeText(string text) //= null
{
    SentenceDiagnostics trace = new SentenceDiagnostics(text, "N/A");

    //Nothing to parse.
    bool nothingToParse = string.IsNullOrWhiteSpace(text) || NormalizationTasks.IsNullWhiteOrPunctuation(text);
    if (nothingToParse)
    {
        return "";
    }

    //Don't normalize a comment.
    if (text.StartCheck("///") && !text.Contains("\n"))
    {
        return text;
    }

    string result = NormalizationTasks.RemoveInternalWhiteSpace(
        NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(text));

    //Is this better early or later?
    if (result.Contains(@""""""))
    {
        result = result.Replace(@"""""", @"""");
    }

    //Hide tokens that otherwise have a different meaning.
    if (result.ContainsCheck(" li pi "))
    {
        result = result.Replace(" li pi ", " li XXXXZiXXXX ");
    }

    // "/\\*.*?\\*/"
    // Things that cross sentences should already be dealt with earlier.
    if (result.ContainsCheck("/*") && result.ContainsCheck("*/"))
    {
        result = NormalizationTasks.ApplyNormalization(result, "Comments", NormalizationTasks.StripMultilineComments);
    }

    //Process explicit Foreign text. (this always happens)
    if (result.ContainsCheck("\""))
    {
        result = NormalizationTasks.ApplyNormalization(result, "ForeignSpace", NormalizationTasks.ProcessWhiteSpaceInForeignText, dialect);
    }

    //Swap terminators (always happens)
    result = NormalizationTasks.ApplyNormalization(result, "Foreign", NormalizeForeignText.NormalizeExplicit, dialect);

    //Hyphenated words. This could cause a problem for compound words that cross lines.
    if (result.ContainsCheck("-\n"))
    {
        result = result.Replace("-\n", "");
    }

    //can't cope with line breaks.
    if (result.ContainsCheck("\n"))
    {
        result = result.Replace("\n", " ");
    }
    if (result.ContainsCheck("\t"))
    {
        result = result.Replace("\t", " ");
    }

    //must be after - processing
    //Don't infer numbers.

    //Extraneous punctuation-- TODO, expand to most other symbols.
    if (result.ContainsCheck("(") || result.ContainsCheck(")"))
    {
        result = result.Replace("(", "").Replace(")", "");
    }

    //Extraneous commas... not sure, we'd like some to go away, but we want ,lon ,sama etc to stay,
    //so the general ExtraCommas normalization is intentionally not applied here.

    //Normalize prepositions to ~, so that we don't have tokens with embedded spaces (e.g. foo, kepeken => [foo],[, kepeken])
    if (result.ContainsCheck(","))
    {
        foreach (string prep in Particles.Prepositions)
        {
            string marked = " ~" + prep;

            string tight = "," + prep;
            if (result.ContainsCheck(tight))
            {
                result = result.Replace(tight, marked);
            }

            string spaced = ", " + prep;
            if (result.ContainsCheck(spaced))
            {
                result = result.Replace(spaced, marked);
            }
        }
    }

    if (result.ContainsCheck(" "))
    {
        result = NormalizationTasks.ApplyNormalization(result, "ExtraWhiteSpace", NormalizationTasks.ProcessExtraneousWhiteSpace);
    }

    //Okay, phrases should be recognizable now.
    //Don't infer compound words or implicit prepositions.

    //la o
    //invisible implicit subject.
    if (result.ContainsCheck(" la o "))
    {
        result = result.Replace(" la o ", " la jan Sanwan o ");
    }

    result = NormalizeMiSina.MiSinaProcessAndUndoOverNormalization(result);

    result = Regex.Replace(result, @"^\s+|\s+$", ""); //Remove extraneous whitespace

    result = NormalizeMiSina.ProcessMiSinaOvernormalizationWithPrepositions(result);
    result = NormalizeMiSina.ProcessMiSinaOverNormalizationWithoutPrepositions(text, result);

    if (result.ContainsCheck("'"))
    {
        result = NormalizationTasks.ApplyNormalization(result, "DirectQuotes", NormalizationTasks.AddDirectedQuotes);
    }

    //Probably added above by mistake
    result = NormalizationTasks.TimeWhiteSpaceAndSpaceBeforeSentenceTerminators(
        NormalizationTasks.RemoveInternalWhiteSpace(result));

    //Post conditions.
    if (result.StartCheck("« »"))
    {
        throw new NormalizationException("quote recognition went wrong: " + text);
    }

    trace = new SentenceDiagnostics(text, result);
    return result;
}