/// <summary>
/// Splits, normalizes and parses a fixed list of corpus texts, writing every parsed
/// sentence to the console in the "g" format.
/// </summary>
/// <remarks>
/// Exceptions are not caught here, so the first sentence that cannot be normalized or
/// parsed ends the run. The failure counter is never incremented, so the summary line
/// printed after each text always reports zero.
/// </remarks>
public void IdentifyDiscourses_ParseKnownGoodTexts_ShowGoodOnes()
{
    // Deliberately left out: CorpusTexts.JanSin (too many neologisms to cope with),
    // CorpusTexts.UnpaText and CorpusTexts.Gilgamesh.
    string[] corpusTexts =
    {
        CorpusTexts.ProfesorAndMadMan,
        CorpusTexts.SampleText1,
        CorpusTexts.SampleText3,
        CorpusTexts.Lao,
        CorpusTexts.GeorgeSong,
        CorpusTexts.CrazyAnimal,
        CorpusTexts.CrazyAnimal2,
        CorpusTexts.RuneDanceSong,
        CorpusTexts.janPusaRice,
        CorpusTexts.janPend
    };

    Dialect dialect = Dialect.LooseyGoosey;
    dialect.TargetGloss = "en";

    // The gloss maker is constructed but not used by this test.
    GlossMaker glossMaker = new GlossMaker();
    ParserUtils parser = new ParserUtils(dialect);
    int failureCount = 0;
    SentenceSplitter splitter = new SentenceSplitter(dialect);
    Normalizer normalizer = new Normalizer(dialect);

    foreach (string corpusText in corpusTexts)
    {
        string[] rawSentences = splitter.ParseIntoNonNormalizedSentences(corpusText);
        string[] normalizedSentences = new string[rawSentences.Length];

        int position = 0;
        while (position < rawSentences.Length)
        {
            string raw = rawSentences[position];
            normalizedSentences[position] = normalizer.NormalizeText(raw);

            Sentence parsed = parser.ParsedSentenceFactory(normalizedSentences[position], raw);
            Console.WriteLine(parsed.ToString("g"));

            position++;
        }

        Console.WriteLine(failureCount + " failed sentences.");
    }
}
/// <summary>
/// Walks every corpus file, keeps only the sentences whose normalized form contains the
/// whole words "anu seme", parses each of them and prints the original text, the
/// bracketed ("b") rendering and the English gloss.
/// </summary>
/// <remarks>
/// Exceptions are not caught: the first sentence that fails to normalize, parse or gloss
/// aborts the run. The counter therefore tracks matched sentences, not failures, and the
/// summary line is labelled accordingly.
/// </remarks>
public void StressTestNormalize_AnuSeme()
{
    int matched = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);

    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;

    CorpusFileReader reader = new CorpusFileReader(true);
    GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);

    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            string normalized = norm.NormalizeText(original);
            if (!normalized.ContainsWholeWord("anu seme"))
            {
                continue;
            }

            matched++;

            Sentence structured = pu.ParsedSentenceFactory(normalized, original);
            string diag = structured.ToString("b");

            Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
            Console.WriteLine("B: " + diag);
            Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
        }
    }

    // Previously printed as "Failed Sentences", although the counter is bumped for every
    // matching sentence and nothing in this method records a failure.
    Console.WriteLine("Matched Sentences: " + matched);
}
/// <summary>
/// Normalizes, parses, brackets and glosses every sentence of every corpus file, logging
/// each sentence that throws and printing totals at the end.
/// </summary>
/// <remarks>
/// Files whose name contains "janKipoCollected" are skipped. Failures are caught per
/// sentence so that a single bad sentence does not stop the sweep.
/// </remarks>
public void StressTestNormalizeAndParseEverything()
{
    int failed = 0;
    int total = 0;

    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);

    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;

    CorpusFileReader reader = new CorpusFileReader(true);
    GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);

    Stopwatch watch = Stopwatch.StartNew();

    foreach (string s in reader.NextFile())
    {
        if (reader.currentFile.ContainsCheck("janKipoCollected"))
        {
            continue; // Can't parse: *janMato 123 123 ni li musi!
        }

        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            total++;

            try
            {
                string normalized = norm.NormalizeText(original);
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);

                // Results are discarded; the calls are made only to see whether they throw.
                structured.ToString("b");
                gm.GlossOneSentence(false, structured, english);
            }
            catch (Exception ex)
            {
                failed++;

                // Log the sentence we are actually iterating over instead of reading it
                // back from SentenceDiagnostics.CurrentSentence.
                // NOTE(review): presumably that diagnostic is only populated once parsing
                // starts, so it could be stale or null when NormalizeText throws - confirm.
                Console.WriteLine(original);
                Console.WriteLine(ex.Message);
            }
        }
    }

    watch.Stop();

    Console.WriteLine("Failed Sentences: " + failed);
    Console.WriteLine("Total Sentences: " + total);
    Console.WriteLine("Elapsed ms: " + watch.ElapsedMilliseconds);
}
/// <summary>
/// Generates 1000 sentences, then for each one prints: its normalized text, its bracketed
/// ("b") form, its gloss, the normalized text of the re-parsed sentence, and the gloss of
/// that second normalization.
/// </summary>
public void GenerateObjectAndStringifyParseGloss()
{
    // Important: create the dialect only once; mixing dialect instances is bad business.
    Dialect dialect = Dialect.LooseyGoosey;
    dialect.IncludeApocrypha = false;

    List<Sentence> generated = new List<Sentence>();
    TextGenerator generator = new TextGenerator(dialect);
    while (generated.Count < 1000)
    {
        generated.Add(generator.GenerateSentence());
    }

    ParserUtils parser = new ParserUtils(dialect);
    GlossMaker glossMaker = new GlossMaker();

    foreach (Sentence candidate in generated)
    {
        string text = candidate.ToString();

        // A fresh normalizer is built for every sentence, as before.
        NormalizeExplicit normalizer = new NormalizeExplicit(dialect);
        string firstPass = normalizer.NormalizeText(text);

        Console.WriteLine(firstPass);
        Console.WriteLine(candidate.ToString("b"));
        Console.WriteLine(glossMaker.Gloss(firstPass, text, dialect));

        // Round trip: parse the normalized text, stringify it and normalize that again.
        Sentence roundTripped = parser.ParsedSentenceFactory(firstPass, text);
        string secondPass = normalizer.NormalizeText(roundTripped.ToString());

        Console.WriteLine(secondPass);
        Console.WriteLine(glossMaker.Gloss(secondPass, text, dialect));
    }
}
/// <summary>
/// Processes <paramref name="parse"/>.SourceText one sentence at a time and fills the
/// view model's Normalized, Recovered, Formatted, FormattedPos, Glossed, Colorized and
/// Errors properties with line-numbered HTML fragments.
/// </summary>
/// <param name="parse">View model that supplies the source text and receives the results.</param>
/// <remarks>
/// Every stage (normalize, repeat back, colorize, bracket, gloss) has its own try/catch so
/// that a failure in one stage is reported in that stage's output and does not prevent the
/// remaining stages from running. If parsing itself fails, the error is written to every
/// output except Normalized.
/// </remarks>
private static void ProcessParserModelSentences(SimpleParserViewModel parse)
{
    Dialect dialect = BindDialect(parse);
    ParserUtils pu = new ParserUtils(dialect);
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);

    string[] sentences = ss.ParseIntoNonNormalizedSentences(parse.SourceText);

    StringBuilder normalizedSb = new StringBuilder();
    StringBuilder spitBackSb = new StringBuilder();
    StringBuilder bracketSb = new StringBuilder();
    StringBuilder posSb = new StringBuilder();
    StringBuilder glossSb = new StringBuilder();
    StringBuilder errors = new StringBuilder();
    StringBuilder colorized = new StringBuilder();
    HtmlFormatter hf = new HtmlFormatter();

    int i = 1;
    foreach (string sentence in sentences)
    {
        string lineNumber = LineNumber(i, true);

        string normalized;
        try
        {
            normalized = norm.NormalizeText(sentence);
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT NORMALIZE: " + ex.Message + "]]";
            normalizedSb.AppendLine(error.ToHtml() + "<br/>");
            normalizedSb.AppendLine(hf.BoldTheWords(sentence.ToHtml()) + "<br/>");

            // Carry on with the raw sentence so the later stages still get a chance.
            normalized = sentence;
            UpdateErrors(errors, error, sentence);
        }

        //////// TP
        normalizedSb.AppendLine(lineNumber + hf.BoldTheWords(normalized.ToHtml()) + "<br/>");

        try
        {
            Sentence parsedSentence = pu.ParsedSentenceFactory(normalized, sentence);

            //////// TP, repeated back
            try
            {
                spitBackSb.AppendLine(lineNumber + parsedSentence.ToString("g", dialect).ToHtml() + "<br/>");
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT REPEAT BACK: " + ex.Message + " for " + sentence + "]]";
                spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }

            //////// TP, colorized
            try
            {
                colorized.AppendLine(lineNumber + parsedSentence.ToString("html", dialect) + "<br/>");
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT COLORIZE: " + ex.Message + "]]";

                // Report the failure in the colorized output itself. It used to be appended
                // to the repeat-back buffer, which added stray lines to Recovered and left
                // Colorized one line short.
                colorized.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                colorized.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }

            //////// TP, bracketed
            try
            {
                bracketSb.AppendLine(lineNumber + hf.BoldTheWords(parsedSentence.ToString("b", dialect).ToHtml()) + "<br/>");
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT BRACKET: " + ex.Message + " for " + sentence + "]]";
                bracketSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                bracketSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }

            //////// ENGLISH
            try
            {
                dialect.TargetGloss = "en";
                GlossMaker gm = new GlossMaker();

                string glossed = gm.Gloss(normalized, sentence, "en", false);
                glossSb.AppendLine(lineNumber + glossed.ToHtml() + "<br/>");

                glossed = gm.Gloss(normalized, sentence, "en", true);
                posSb.AppendLine(lineNumber + glossed.ToHtml() + "<br/>"); //bs doesn't do anything.
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT GLOSS: " + ex.Message.ToHtml() + " for " + sentence.ToHtml() + "]]";
                glossSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                glossSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                posSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                posSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }
        }
        catch (Exception ex)
        {
            // Parsing failed outright: no per-stage output exists, so show the error in
            // every view except the normalized one (which was already written above).
            string error = "[[CANNOT Parse: " + ex.Message.ToHtml() + "]]";
            foreach (StringBuilder sb in new StringBuilder[] { spitBackSb, bracketSb, posSb, glossSb, colorized })
            {
                sb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
                sb.Append(sentence.ToHtml() + "<br/>");
            }
            UpdateErrors(errors, error, sentence);
        }
        finally
        {
            // The gloss stage switches the target to "en"; switch back before the next sentence.
            dialect.TargetGloss = "tp";
        }

        i++;
    }

    parse.Normalized = normalizedSb.ToString();
    parse.Recovered = spitBackSb.ToString();
    parse.Formatted = bracketSb.ToString();
    parse.FormattedPos = hf.SubThePartsOfSpeech(posSb.ToString());
    parse.Glossed = glossSb.ToString();
    parse.Colorized = colorized.ToString();
    parse.Errors = errors.ToString();
}
/// <summary>
/// Processes <paramref name="parse"/>.SourceText paragraph by paragraph and fills the view
/// model's Recovered, Formatted, FormattedPos, Glossed and Colorized properties with HTML
/// fragments. Normalized and Errors are assigned but nothing is written to them here.
/// </summary>
/// <param name="parse">View model that supplies the source text and receives the results.</param>
/// <remarks>
/// If the text cannot be parsed as prose at all, the work is handed to
/// <c>ProcessParserModelSentences</c> instead. Each rendering stage has its own try/catch
/// so a failure in one stage is shown in that stage's output only.
/// </remarks>
private static void ProcessParserModelParagraphs(SimpleParserViewModel parse)
{
    Dialect dialect = BindDialect(parse);
    ParagraphSplitter ps = new ParagraphSplitter(dialect);

    StringBuilder normalizedSb = new StringBuilder();
    StringBuilder spitBackSb = new StringBuilder();
    StringBuilder bracketSb = new StringBuilder();
    StringBuilder posSb = new StringBuilder();
    StringBuilder glossSb = new StringBuilder();
    StringBuilder errors = new StringBuilder();
    StringBuilder colorized = new StringBuilder();
    HtmlFormatter hf = new HtmlFormatter();

    Prose prose;
    try
    {
        prose = ps.ParseProse(parse.SourceText);
    }
    catch (Exception)
    {
        // Paragraph-level parsing is not possible; fall back to sentence-by-sentence mode.
        ProcessParserModelSentences(parse);
        return;
    }

    foreach (Paragraph paragraph in prose.Paragraphs)
    {
        //////// TP, repeated back
        try
        {
            spitBackSb.AppendLine(paragraph.ToString("g", dialect).ToHtml() + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT REPEAT BACK: " + ex.Message + "]]";
            spitBackSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }

        //////// TP, colorized
        try
        {
            colorized.AppendLine(paragraph.ToString("html", dialect) + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT COLORIZE: " + ex.Message + "]]";

            // Report the failure in the colorized output; it used to be appended to the
            // repeat-back buffer, leaving Colorized silently short.
            colorized.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }

        //////// TP, bracketed
        try
        {
            bracketSb.AppendLine(hf.BoldTheWords(paragraph.ToString("b", dialect).ToHtml()) + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT BRACKET: " + ex.Message + "]]";
            bracketSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }

        //////// ENGLISH
        try
        {
            dialect.TargetGloss = "en";
            GlossMaker gm = new GlossMaker();

            string glossed = gm.GlossParagraph(paragraph, dialect);
            glossSb.AppendLine(glossed.ToHtml() + "<br/>");

            glossed = gm.GlossParagraph(paragraph, dialect, true);
            posSb.AppendLine(glossed.ToHtml() + "<br/>"); //bs doesn't do anything.
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT GLOSS: " + ex.Message.ToHtml() + "]]";
            glossSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
            posSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
        }
        finally
        {
            // Switch the target back, exactly as the sentence-by-sentence path does, so
            // the next paragraph's TP renderings do not run with the English target.
            dialect.TargetGloss = "tp";
        }
    }

    parse.Normalized = normalizedSb.ToString();
    parse.Recovered = spitBackSb.ToString();
    parse.Formatted = bracketSb.ToString();
    parse.FormattedPos = hf.SubThePartsOfSpeech(posSb.ToString());
    parse.Glossed = glossSb.ToString();
    parse.Colorized = colorized.ToString();
    parse.Errors = errors.ToString();
}
/// <summary>
/// Generates 1000 sentences and returns a view whose model carries, for each sentence, the
/// colorized, bracketed, glossed and part-of-speech renderings concatenated as HTML.
/// </summary>
/// <returns>A view bound to a <c>LoremIpsumModel</c> holding the generated HTML.</returns>
/// <remarks>
/// Each rendering stage is wrapped in its own try/catch so one failing stage does not
/// suppress the others. The repeat-back text and the collected error list are built but
/// are not part of the returned HTML.
/// </remarks>
public ActionResult LoremIpsum()
{
    Dialect dialect = Dialect.LooseyGoosey;
    dialect.IncludeApocrypha = false;

    TextGenerator tg = new TextGenerator(dialect);
    List<Sentence> sentences = new List<Sentence>();
    for (int i = 0; i < 1000; i++)
    {
        sentences.Add(tg.GenerateSentence());
    }

    StringBuilder spitBackSb = new StringBuilder();
    StringBuilder bracketSb = new StringBuilder();
    StringBuilder posSb = new StringBuilder();
    StringBuilder glossSb = new StringBuilder();
    StringBuilder colorized = new StringBuilder();
    StringBuilder errors = new StringBuilder();
    HtmlFormatter hf = new HtmlFormatter();
    StringBuilder sb = new StringBuilder();

    // Remembered so the gloss stage can put the dialect back the way it found it;
    // otherwise every sentence after the first is rendered with the English target set.
    var originalTargetGloss = dialect.TargetGloss;

    foreach (Sentence s in sentences)
    {
        Sentence parsedSentence = s;

        //////// TP, repeated back
        try
        {
            string normalized = parsedSentence.ToString("g", dialect);
            spitBackSb.AppendLine(normalized.ToHtml() + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT REPEAT BACK: " + ex.Message + "]]";
            spitBackSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
            errors.Append(error);
        }

        //////// TP, colorized
        try
        {
            string result = parsedSentence.ToString("html", dialect);
            colorized.AppendLine(result + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT COLORIZE: " + ex.Message + "]]";

            // Write the failure where the colorized text would have gone. It used to be
            // appended to the repeat-back buffer, which never reaches the page.
            colorized.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
            errors.Append(error);
        }

        //////// TP, bracketed
        try
        {
            string diagrammed = parsedSentence.ToString("b", dialect);
            bracketSb.AppendLine(hf.BoldTheWords(diagrammed.ToHtml()) + "<br/>");
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT BRACKET: " + ex.Message + "]]";
            bracketSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
            errors.Append(error);
        }

        //////// ENGLISH
        try
        {
            dialect.TargetGloss = "en";
            GlossMaker gm = new GlossMaker();

            string glossed = gm.GlossOneSentence(false, s, dialect);
            glossSb.AppendLine(glossed.ToHtml() + "<br/>");

            glossed = gm.GlossOneSentence(true, s, dialect);
            posSb.AppendLine(glossed.ToHtml() + "<br/>"); //bs doesn't do anything.
        }
        catch (Exception ex)
        {
            string error = "[[CANNOT GLOSS: " + ex.Message.ToHtml() + "]]";
            glossSb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
            errors.Append(error);
        }
        finally
        {
            dialect.TargetGloss = originalTargetGloss;
        }

        // Emit this sentence's renderings, then reset the per-sentence buffers.
        sb.Append(colorized.ToString());
        sb.Append(bracketSb.ToString());
        sb.Append(glossSb.ToString());
        sb.Append(hf.SubThePartsOfSpeech(posSb.ToString()));
        sb.Append("<br/>");

        spitBackSb.Clear();
        bracketSb.Clear();
        posSb.Clear();
        glossSb.Clear();
        colorized.Clear();
    }

    return View(new LoremIpsumModel { Html = sb.ToString() });
}
/// <summary>
/// Walks every corpus file, keeps the sentences whose normalized form contains the whole
/// word "ala" or "kin" (minus a handful of known-unsupported shapes), parses each of them
/// and prints the original text, the bracketed ("b") rendering and the English gloss.
/// </summary>
/// <remarks>
/// Exceptions are not caught, so the first failing sentence aborts the run. The failure
/// counter is never incremented, so the closing line always reports zero.
/// </remarks>
public void StressTestNormalizeNotIndeedAlaKin()
{
    int failureCount = 0;

    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils parser = new ParserUtils(dialect);

    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;

    Normalizer normalizer = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    GlossMaker glossMaker = new GlossMaker();
    SentenceSplitter splitter = new SentenceSplitter(dialect);

    char[] trimChars = new[] { '\n', '\r', ' ', '\t' };

    foreach (string fileText in reader.NextFile())
    {
        foreach (string original in splitter.ParseIntoNonNormalizedSentences(fileText))
        {
            // "su" is a neologism from back when we didn't know what pu was and hoped it
            // was something like Scandinavian "sem".
            if (original.Contains(" su "))
            {
                continue;
            }

            string normalized = normalizer.NormalizeText(original);

            // Only sentences that actually use "ala" or "kin" are of interest.
            if (!normalized.ContainsWholeWord("ala") && !normalized.ContainsWholeWord("kin"))
            {
                continue;
            }

            bool knownUnsupported =
                // A sample sentence with a logical operator that can't be handled yet
                // (unrelated to kin / ala).
                normalized.ContainsCheck("Kinla")
                // Sentence-initial "kin la": no big deal.
                || normalized.StartCheck("kin la ")
                // Vocatives are not being solved right now.
                || normalized.ContainsCheck("pilin pona o")
                || normalized.ContainsCheck(" o, ");
            if (knownUnsupported)
            {
                continue;
            }

            Sentence structured = parser.ParsedSentenceFactory(normalized, original);
            string bracketed = structured.ToString("b");

            Console.WriteLine("O: " + (original ?? "").Trim(trimChars));
            Console.WriteLine("B: " + bracketed);
            Console.WriteLine("G: " + glossMaker.GlossOneSentence(false, structured, english));
        }
    }

    Console.WriteLine("Failed Sentences: " + failureCount);
}
/// <summary>
/// Walks every corpus file, keeps the sentences whose normalized form splits into more
/// than two pieces around " la ", parses each of them and prints the original text, the
/// re-stringified sentence and the bracketed ("b") rendering.
/// </summary>
/// <remarks>
/// Failures are caught per sentence, logged with the original text and the exception
/// message, and counted; the totals are printed at the end.
/// </remarks>
public void StressTestMultipleLa()
{
    int failureCount = 0;

    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils parser = new ParserUtils(dialect);
    dialect.NumberType = "Stupid";

    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;

    Normalizer normalizer = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();

    // The gloss maker is constructed but not used by this test.
    GlossMaker glossMaker = new GlossMaker();
    SentenceSplitter splitter = new SentenceSplitter(dialect);
    int parsedCount = 0;

    foreach (string fileText in reader.NextFile())
    {
        foreach (string original in splitter.ParseIntoNonNormalizedSentences(fileText))
        {
            // "su" is a neologism from back when we didn't know what pu was and hoped it
            // was something like Scandinavian "sem".
            if (original.Contains(" su "))
            {
                continue;
            }

            // Quoted text that ends up treated as a content word.
            if (original.Contains("o weka e jan Opama tan tomo walo"))
            {
                continue;
            }

            try
            {
                string normalized = normalizer.NormalizeText(original);

                // Keep only sentences with multiple la's: more than two " la "-separated pieces.
                string[] laPieces = normalized.Split(new string[] { " la " }, StringSplitOptions.RemoveEmptyEntries);
                if (laPieces.Length > 2)
                {
                    Sentence structured = parser.ParsedSentenceFactory(normalized, original);
                    string bracketed = structured.ToString("b");
                    string repeated = structured.ToString();

                    Console.WriteLine("Org: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                    Console.WriteLine("Rep: " + repeated);
                    Console.WriteLine("Brk: " + bracketed);
                    parsedCount++;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("ORIGINAL : " + original);
                Console.WriteLine(ex.Message);
                failureCount++;
            }
        }
    }

    Console.WriteLine("Total : " + parsedCount);
    Console.WriteLine("Failed Sentences: " + failureCount);
}