public void WhyDidNormalizerStripOffTheDoubleQuotes()
{
    // Regression check: normalization must preserve the double quotes that
    // mark foreign text ("Georgia") instead of stripping them.
    //,CorpusTexts.JanSin //Too many neologisms to cope.
    string[] samples = { CorpusTexts.GeorgeSong };

    Dialect dialect = Dialect.LooseyGoosey;
    dialect.TargetGloss = "en";

    //ParserUtils pu = new ParserUtils(dialect);
    SentenceSplitter splitter = new SentenceSplitter(dialect);
    Normalizer normalizer = new Normalizer(dialect);

    foreach (string sample in samples)
    {
        foreach (string sentence in splitter.ParseIntoNonNormalizedSentences(sample))
        {
            // Only the sentence mentioning Georgia carries the quoted foreign text.
            if (!sentence.ContainsCheck("Georgia"))
            {
                continue;
            }
            string normalized = normalizer.NormalizeText(sentence);
            Assert.IsTrue(normalized.ContainsCheck("\"Georgia\""));
        }
    }
}
public void SpellCheck_ForStressTest_UsingOnlyWordConstructor()
{
    // Spell-checks the whole corpus by attempting to construct a Word for each
    // token; words that fail more than 10 times are reported to the console.
    CorpusFileReader reader = new CorpusFileReader();

    // word -> failure count. (Was Dictionary<string, string> holding counts as
    // strings; an int count removes the Convert.ToInt32 round trips.)
    Dictionary<string, int> bad = new Dictionary<string, int>();

    // Words already known to construct successfully. The original used a List
    // that was never populated, so every duplicate token was re-validated and
    // Contains was O(n); a HashSet that we actually add to fixes both.
    HashSet<string> good = new HashSet<string>();

    TokenParserUtils tpu = new TokenParserUtils();
    Dialect dialect = Dialect.LooseyGoosey;
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);

    foreach (string s in reader.NextFile())
    {
        foreach (string sentence in ss.ParseIntoNonNormalizedSentences(s))
        {
            string normalized = norm.NormalizeText(sentence); //Normalization improved stuff
            string[] words = tpu.JustTokens(normalized);
            for (int index = 0; index < words.Length; index++)
            {
                //Don't remove double quotes or we can't ID some marked foreign text.
                words[index] = words[index].Trim(new[] { ':', '.', '\'', '«', '»', '!', '?', '-', '[', ']' });
            }

            foreach (string word in words.Where(x => !string.IsNullOrEmpty(x)))
            {
                if (good.Contains(word))
                {
                    continue;
                }
                try
                {
                    new Word(word);  // throws for unrecognized/misspelled words
                    good.Add(word);  // cache success so each word is validated once
                }
                catch (Exception)
                {
                    int count;
                    bad.TryGetValue(word, out count);
                    bad[word] = count + 1;
                }
            }
        }
    }

    foreach (KeyValuePair<string, int> pair in bad)
    {
        if (pair.Value > 10)
        {
            Console.WriteLine("Uh-oh: " + pair.Key + " " + pair.Value);
        }
    }
}
public void Normalization_Explicit_IsIdempotent()
{
    // Explicit normalization should be idempotent: normalizing an already
    // normalized sentence must yield the same text. (Implicit normalization
    // is NOT expected to be idempotent.) Mismatches are printed, not asserted.
    int count = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    NormalizeExplicit normalizer = new NormalizeExplicit(dialect);
    SentenceSplitter splitter = new SentenceSplitter(dialect);
    CorpusFileReader reader = new CorpusFileReader(true);

    foreach (string file in reader.NextFile())
    {
        if (reader.currentFile.ContainsCheck("janKipo"))
        {
            continue;
        }
        foreach (string sentence in splitter.ParseIntoNonNormalizedSentences(file))
        {
            string once = normalizer.NormalizeText(sentence);
            string twice = normalizer.NormalizeText(once);
            //Assert.AreEqual(result1,result2);
            if (once != twice)
            {
                Console.WriteLine("1: " + (once ?? "NULL"));
                Console.WriteLine("2: " + (twice ?? "NULL"));
            }
            count++;
        }
    }
    Console.WriteLine("Sentences normalized: " + count);
}
// Parses a fixed list of known-good corpus texts and prints each sentence's
// "g"-format rendering.
// NOTE(review): the try/catch is commented out, so `fail` is never
// incremented — "0 failed sentences." is always printed; any parse error
// throws and fails the whole test instead.
public void IdentifyDiscourses_ParseKnownGoodTexts_ShowGoodOnes()
{
    //,CorpusTexts.JanSin //Too many neologisms to cope.
    string[] samples =
    {
        //CorpusTexts.UnpaText,
        //CorpusTexts.Gilgamesh,
        CorpusTexts.ProfesorAndMadMan,
        CorpusTexts.SampleText1,
        CorpusTexts.SampleText3,
        CorpusTexts.Lao,
        CorpusTexts.GeorgeSong,
        CorpusTexts.CrazyAnimal,
        CorpusTexts.CrazyAnimal2,
        CorpusTexts.RuneDanceSong,
        CorpusTexts.janPusaRice,
        CorpusTexts.janPend
    };
    Dialect dialect = Dialect.LooseyGoosey;
    dialect.TargetGloss = "en";
    GlossMaker gm = new GlossMaker();
    ParserUtils pu = new ParserUtils(dialect);
    int fail = 0;
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);
    foreach (string sample in samples)
    {
        string[] sentenceStrings = ss.ParseIntoNonNormalizedSentences(sample);
        string[] normalized = new string[sentenceStrings.Length];
        for (int index = 0; index < sentenceStrings.Length; index++)
        {
            //try
            //{
            normalized[index] = norm.NormalizeText(sentenceStrings[index]);
            Sentence sentence = pu.ParsedSentenceFactory(normalized[index], sentenceStrings[index]);
            Console.WriteLine(sentence.ToString("g"));
            //}
            //catch (Exception ex)
            //{
            //    fail++;
            //    Console.WriteLine(sentenceStrings[index]);
            //    Console.WriteLine(ex);
            //}
        }
        Console.WriteLine(fail + " failed sentences.");
    }
}
// Stress test: scans the whole corpus for sentences containing "anu seme"
// (the question tag) and, for each hit, parses it and prints the original,
// its bracket ("b") diagram, and an English gloss.
// NOTE(review): `i` counts matched sentences — the catch that incremented it
// on failure is commented out — so the final "Failed Sentences" line actually
// reports the "anu seme" hit count.
public void StressTestNormalize_AnuSeme()
{
    int i = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;
    CorpusFileReader reader = new CorpusFileReader(true);
    GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            //try
            //{
            string normalized = norm.NormalizeText(original);
            if (!(normalized.ContainsWholeWord("anu seme")))
            {
                continue;
            }
            i++;
            Sentence structured = pu.ParsedSentenceFactory(normalized, original);
            string diag = structured.ToString("b");
            Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
            Console.WriteLine("B: " + diag);
            Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
            //}
            //catch (Exception ex)
            //{
            //    if (ex.Message.ContainsCheck("all tests"))
            //    {
            //        Console.WriteLine("ORIGINAL : " + original);
            //        if (structured != null)
            //        {
            //            Console.WriteLine(structured.ToString("b"));
            //        }
            //        Console.WriteLine(ex.Message);
            //        i++;
            //    }
            //    else throw;
            //}
        }
    }
    Console.WriteLine("Failed Sentences: " + i);
}
private static void ProcessSerializationsModel(SerializationsModel parse)
{
    // Parses the model's source text and fills in the JSON/XML serializations
    // of the parsed sentences. Normalization/parse failures are collected as
    // bracketed HTML error lines rather than thrown.
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    Normalizer norm = new Normalizer(dialect);
    SentenceSplitter ss = new SentenceSplitter(dialect);
    string[] sentences = ss.ParseIntoNonNormalizedSentences(parse.SourceText);
    StringBuilder errors = new StringBuilder();
    List<Sentence> parseSentences = new List<Sentence>();
    int i = 0;
    foreach (string sentence in sentences)
    {
        i++;
        if (i >= 3)
        {
            // Only the first two sentences are processed. (Was `continue`,
            // which spun through every remaining sentence doing nothing;
            // `break` is behaviorally identical and cheaper.)
            break;
        }
        string normalized;
        try
        {
            normalized = norm.NormalizeText(sentence);
        }
        catch (Exception ex)
        {
            normalized = "[[CANNOT NORMALIZE: " + ex.Message + " for " + sentence + "]]";
            errors.AppendLine(normalized + "<br/>");
            continue;
        }
        Sentence parsedSentence;
        try
        {
            parsedSentence = pu.ParsedSentenceFactory(normalized, sentence);
            parseSentences.Add(parsedSentence);
        }
        catch (Exception ex)
        {
            string cantParse = "[[CANNOT PARSE: " + ex.Message.ToHtml() + " for " + sentence.ToHtml() + "]]";
            errors.AppendLine(cantParse.ToHtml() + "<br/>");
        }
        finally
        {
            // Reset in case a formatting call switched the gloss language.
            dialect.TargetGloss = "tp";
        }
    }
    parse.Json = parseSentences.ToJsonNet();
    parse.Xml = FormatXml(parseSentences.ToDataContractXml());
    parse.Html = "Not implemented yet";
}
// Stress test: normalizes, parses, brackets, and glosses every sentence in
// the corpus, counting (and printing) the ones that throw.
// janKipoCollected is skipped entirely: it contains starred sentences the
// parser can't handle ("*janMato 123 123 ni li musi!").
public void StressTestNormalizeAndParseEverything()
{
    int i = 0;      // failed-sentence count
    int total = 0;  // sentences attempted
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;
    CorpusFileReader reader = new CorpusFileReader(true);
    GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);
    Stopwatch watch = new Stopwatch();
    watch.Start();
    foreach (string s in reader.NextFile())
    {
        if (reader.currentFile.ContainsCheck("janKipoCollected"))
        {
            continue; // Can't parse: *janMato 123 123 ni li musi!
        }
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            total++;
            //if (watch.ElapsedMilliseconds > 15000) return;
            //if (total > 1000) return;
            Sentence structured = null;
            try
            {
                string normalized = norm.NormalizeText(original);
                structured = pu.ParsedSentenceFactory(normalized, original);
                string diag = structured.ToString("b");
                string gloss = gm.GlossOneSentence(false, structured, english);
                // Console.WriteLine("O: " + (original??"").Trim(new []{'\n','\r',' ','\t'}));
                // Console.WriteLine("B: " + diag);
                // Console.WriteLine("G: " + gloss);
            }
            catch (Exception ex)
            {
                i++;
                Console.WriteLine(SentenceDiagnostics.CurrentSentence.Original);
                Console.WriteLine(ex.Message);
                //else throw;
            }
        }
    }
    Console.WriteLine("Failed Sentences: " + i);
}
public void QuotedSentence()
{
    // A fully quoted sentence: the splitter should strip the double quotes
    // from the split result.
    string text = @"""Saint Augustine Prophecy"".";
    Dialect dialect = Dialect.LooseyGoosey;
    SentenceSplitter splitter = new SentenceSplitter(dialect);

    string[] results = splitter.ParseIntoNonNormalizedSentences(text);

    Console.WriteLine(results[0]);
    Assert.IsTrue(!results[0].Contains("\""));
    //Assert.IsTrue(results[0].StartCheck("\""));
}
// Round-trip stress test: parses each corpus sentence and re-serializes it
// with ToString(), then loosely compares letters (TpLettersEqual) against the
// original. Prints mismatches and reports total sentences, mismatch count (j),
// and failed-parse count (i).
public void StressTest_Parse_SpitBack_LooselyCompare()
{
    int i = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    CorpusFileReader reader = new CorpusFileReader(true);
    Normalizer norm = new Normalizer(dialect);
    SentenceSplitter ss = new SentenceSplitter(dialect);
    int total = 0;
    int j = 0;
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            if (original.StartCheck("*") && reader.currentFile.ContainsCheck("janKipoCollected")) // Can't parse: *janMato 123 123 ni li musi!
            {
                continue;
            }
            if (original.StartCheck("///")) //Don't care if comments got corrupted.
            {
                continue;
            }
            total++;
            try
            {
                string normalized = norm.NormalizeText(original);
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                string diag = structured.ToString();
                if (!diag.TpLettersEqual(original))
                {
                    Console.WriteLine("O: " + original.Trim(new[] { ' ', '\t', '\n', '\r' }).Replace("\n", " "));
                    Console.WriteLine("G: " + diag);
                    Console.WriteLine(" --- ");
                    j++;
                }
            }
            catch (Exception)
            {
                i++;
            }
        }
    }
    Console.WriteLine("Total: " + total);
    Console.WriteLine("Mismatched: " + j);
    Console.WriteLine("Failed Sentences: " + i);
}
private static void Execute(string s)
{
    // Normalizes and parses every sentence in the given text, printing the
    // normalized form of each as it goes. The parse result is discarded; this
    // exists to surface exceptions from ParsedSentenceFactory.
    Dialect dialect = Dialect.LooseyGoosey;
    Normalizer normalizer = new Normalizer(dialect);
    ParserUtils parser = new ParserUtils(dialect);
    SentenceSplitter splitter = new SentenceSplitter(dialect);

    foreach (string original in splitter.ParseIntoNonNormalizedSentences(s))
    {
        Console.WriteLine("----");
        string normalized = normalizer.NormalizeText(original);
        Console.WriteLine(normalized);
        parser.ParsedSentenceFactory(normalized, original);
    }
}
public CorpusKnowledge(string corpus, Dialect dialect)
{
    // Splits the corpus into sentences and normalizes each one up front.
    //https://stackoverflow.com/questions/521146/c-sharp-split-string-but-keep-split-chars-separators
    //https://stackoverflow.com/questions/3115150/how-to-escape-regular-expression-special-characters-using-javascript
    this.dialect = dialect;
    this.norm = new Normalizer(dialect);

    SentenceSplitter splitter = new SentenceSplitter(dialect);
    sentences = splitter.ParseIntoNonNormalizedSentences(corpus);
    for (int i = 0; i < sentences.Length; i++)
    {
        sentences[i] = norm.NormalizeText(sentences[i]);
    }
}
public void IdentifyDiscourses_CanWeGroupThem()
{
    // Parses a sample text and verifies the sentences can be grouped into
    // discourses, then prints each discourse with numbered sentences.
    Dialect dialect = Dialect.LooseyGoosey;
    Normalizer norm = new Normalizer(dialect);
    ParserUtils pu = new ParserUtils(dialect);
    SentenceSplitter ss = new SentenceSplitter(dialect);

    // Explicit loop in place of the original Select/Where pipeline:
    // skip blank raw text, skip text that normalizes to blank, parse the rest.
    List<Sentence> parsed = new List<Sentence>();
    foreach (string raw in ss.ParseIntoNonNormalizedSentences(CorpusTexts.UnpaText))
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            continue;
        }
        string normalized = norm.NormalizeText(raw);
        if (string.IsNullOrWhiteSpace(normalized))
        {
            continue;
        }
        parsed.Add(pu.ParsedSentenceFactory(normalized, raw));
    }
    Sentence[] s = parsed.Where(x => x != null).ToArray();
    Assert.Greater(s.Length, 0);

    List<Sentence>[] discourses = ss.GroupIntoDiscourses(s);
    Assert.Greater(discourses.Length, 0);
    foreach (List<Sentence> discourse in discourses)
    {
        Assert.Greater(discourse.Count, 0);
    }

    Console.WriteLine("-------------------");
    foreach (List<Sentence> discourse in discourses)
    {
        int number = 0;
        foreach (Sentence sentence in discourse)
        {
            number++;
            Console.WriteLine(number + ") " + sentence.ToString("b"));
        }
        Console.WriteLine("-------------------");
    }
}
public void SplitSentenceWithColon_Normalized()
{
    // A colon splits "sina toki e ni: mi wile e ni." into two sentences;
    // normalization then inserts the explicit "li" particle into each.
    const string text = "sina toki e ni: mi wile e ni.";
    Dialect dialect = Dialect.LooseyGoosey;
    Normalizer norm = new Normalizer(dialect);
    SentenceSplitter ss = new SentenceSplitter(dialect);

    string[] sentences = ss.ParseIntoNonNormalizedSentences(text)
                           .Select(x => norm.NormalizeText(x))
                           .ToArray();

    Assert.AreEqual("sina li toki e ni:", sentences[0]);
    Assert.AreEqual("mi li wile e ni.", sentences[1]);
}
public void NormalizeAllTextFiles()
{
    // Normalizes every sentence in the corpus and reports what percentage
    // of each normalized result is toki pona text.
    int count = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    Normalizer norm = new Normalizer(dialect);
    //ParserUtils pu = new ParserUtils(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    SentenceSplitter splitter = new SentenceSplitter(dialect);

    foreach (string file in reader.NextFile())
    {
        foreach (string sentence in splitter.ParseIntoNonNormalizedSentences(file))
        {
            string normalized = norm.NormalizeText(sentence);
            decimal percent = NormalizeForeignText.PercentTokiPona(normalized);
            Console.WriteLine(percent + "%");
            count++;
        }
    }
    Console.WriteLine("Sentences normalized: " + count);
}
public void StressTestParseThingsWithNumbers()
{
    // Runs the corpus through number detection ("Body" number style) and
    // prints every sentence where a number token (#) was identified.
    int failures = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    dialect.InferCompoundsPrepositionsForeignText = false;
    dialect.InferNumbers = true;
    dialect.NumberType = "Body";
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    SentenceSplitter splitter = new SentenceSplitter(dialect);

    foreach (string file in reader.NextFile())
    {
        foreach (string original in splitter.ParseIntoNonNormalizedSentences(file))
        {
            try
            {
                string normalized = norm.NormalizeText(original);
                string withNumbers = NormalizeNumbers.FindNumbers(normalized, dialect);
                if (withNumbers.ContainsCheck("#"))
                {
                    Console.WriteLine("O: " + original);
                    Console.WriteLine("N: " + normalized);
                    Console.WriteLine("#: " + withNumbers);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("ORIGINAL : " + original);
                Console.WriteLine(ex.Message);
                failures++;
            }
        }
    }
    Console.WriteLine("Failed Sentences: " + failures);
}
// Stress test: finds corpus sentences whose normalized form contains more
// than two "la" fragments (i.e. at least three clauses separated by " la ")
// and prints the original, the re-serialized parse, and the bracket diagram.
// `i` counts parse failures; `total` counts successfully printed sentences.
public void StressTestMultipleLa()
{
    int i = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    dialect.NumberType = "Stupid";
    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    int total = 0;
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            if (original.Contains(" su "))
            {
                continue; //neologism, back when we didn't know what pu was and hoped it was something like scandinavian sem
            }
            if (original.Contains("o weka e jan Opama tan tomo walo"))
            {
                continue; //quoted text treated as a content word.
            }
            try
            {
                string normalized = norm.NormalizeText(original);
                //Multiple la's
                if (normalized.Split(new string[] { " la " }, StringSplitOptions.RemoveEmptyEntries).Length <= 2)
                {
                    continue;
                }
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                string diag = structured.ToString("b");
                string spitBack = structured.ToString();
                //if ((normalized.ContainsCheck("%ante"))) continue; //verb!
                Console.WriteLine("Org: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                Console.WriteLine("Rep: " + spitBack);
                Console.WriteLine("Brk: " + diag);
                //Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
                total++;
            }
            catch (Exception ex)
            {
                Console.WriteLine("ORIGINAL : " + original);
                Console.WriteLine(ex.Message);
                i++;
            }
        }
    }
    Console.WriteLine("Total : " + total);
    Console.WriteLine("Failed Sentences: " + i);
}
// Builds a word-to-word transition matrix over the corpus: for every ordered
// pair of non-deprecated dictionary words, counts how often the second
// immediately follows the first in the "g"-format re-serialization of each
// parsed sentence, then dumps the matrix tab-separated to the console.
// NOTE(review): `sb` is created but never used — output goes straight to
// Console; `header` only gates printing the column-header row once.
public void TransitionMatrix()
{
    int i = 0; // failed-sentence count
    Dialect dialect = Dialect.LooseyGoosey;
    dialect.InferCompoundsPrepositionsForeignText = false;
    dialect.InferNumbers = true;
    dialect.NumberType = "Body";
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    ParserUtils pu = new ParserUtils(dialect);
    Dictionary<string, Dictionary<string, int>> matrix = new Dictionary<string, Dictionary<string, int>>(125);
    // Seed the matrix with a zeroed row (and column entry) for every
    // non-deprecated dictionary word.
    foreach (KeyValuePair<string, Word> pair in Words.Dictionary)
    {
        if (Word.Deprecated.Contains(pair.Key))
        {
            continue;
        }
        Dictionary<string, int> following = new Dictionary<string, int>();
        foreach (KeyValuePair<string, Word> inner in Words.Dictionary)
        {
            if (!Word.Deprecated.Contains(inner.Key))
            {
                following.Add(inner.Key, 0);
            }
        }
        matrix.Add(pair.Key, following);
    }
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            try
            {
                string normalized = norm.NormalizeText(original);
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                string restringed = structured.ToString("g");
                List<string> parts = restringed.Split(new[] { ' ' }).ToList();
                string last = null;
                string current = null;
                foreach (string part in parts)
                {
                    current = part;
                    if (last == null)
                    {
                        //Can't do anything yet.
                    }
                    else
                    {
                        // Words outside the seeded matrix (punctuation,
                        // proper names, etc.) are silently skipped.
                        if (matrix.ContainsKey(last))
                        {
                            Dictionary<string, int> transitionScores = matrix[last];
                            if (transitionScores.ContainsKey(current))
                            {
                                transitionScores[current]++;
                            }
                        }
                    }
                    last = current;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("ORIGINAL : " + original);
                Console.WriteLine(ex.Message);
                i++;
            }
        }
    }
    Console.WriteLine("Failed Sentences: " + i);
    StringBuilder sb = new StringBuilder(); // NOTE(review): unused
    int header = 0;
    foreach (KeyValuePair<string, Dictionary<string, int>> pair in matrix)
    {
        if (header == 0)
        {
            header++;
            Console.Write("head\t");
            foreach (var scores in pair.Value)
            {
                Console.Write(scores.Key + "\t");
            }
            Console.WriteLine();
        }
        Console.Write(pair.Key + "\t");
        foreach (var scores in pair.Value)
        {
            if (scores.Value > 0)
            {
                Console.Write(scores.Value + "\t");
            }
            else
            {
                Console.Write("\t");
            }
        }
        Console.WriteLine();
    }
}
// Stress test: hunts the corpus for sentences whose subject contains a
// modified pronoun (mi/sina/ona with at least one modifier), walking both
// the complex-chain and the plain sub-chain shapes of the parsed Subjects,
// and prints the original plus its bracket diagram when one is found.
// NOTE(review): `i` is never incremented (the catch is commented out), so
// the closing "Failed Sentences" line always reports 0; the method also
// returns early at a hard-coded corpus landmark ("ona li alasa pona").
public void StressTestNormalize_Pronouns()
{
    int i = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    Normalizer norm = new Normalizer(dialect);
    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;
    CorpusFileReader reader = new CorpusFileReader();
    //GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            //try
            //{
            string normalized = norm.NormalizeText(original);
            if (!(normalized.ContainsWholeWord("mi") || normalized.ContainsWholeWord("sina") || normalized.ContainsWholeWord("ona")))
            {
                continue;
            }
            if (normalized.ContainsCheck("Kinla"))
            {
                continue; //Has a logical operator in one of the sample sentences that I can't deal with yet, unrelated to kin, ala
            }
            if (normalized.ContainsCheck("o,"))
            {
                continue; //Haven't dealt with vocatives yet.
            }
            if (normalized.ContainsCheck(" li pi "))
            {
                continue; //Will deal with these when I feel like it.
            }
            if (normalized.ContainsCheck("ona li alasa pona"))
            {
                return; //Okay, this is some random point in the middle. 100s is enough!
            }
            Sentence structured = pu.ParsedSentenceFactory(normalized, original);
            bool foundInteresting = false;
            if (structured.Subjects != null)
            {
                // Shape 1: subjects expressed as complex chains of sub-chains.
                if (structured.Subjects.ComplexChains != null)
                {
                    foreach (ComplexChain innerComplexChain in structured.Subjects.ComplexChains)
                    {
                        if (innerComplexChain.SubChains == null)
                        {
                            continue;
                        }
                        foreach (Chain subChain in innerComplexChain.SubChains)
                        {
                            if (subChain.HeadedPhrases == null)
                            {
                                continue;
                            }
                            foreach (HeadedPhrase headedPhrase in subChain.HeadedPhrases)
                            {
                                if (headedPhrase.Modifiers == null || headedPhrase.Modifiers.Count == 0)
                                {
                                    continue;
                                }
                                if (headedPhrase.Head.Text == "mi" || headedPhrase.Head.Text == "sina" || headedPhrase.Head.Text == "ona")
                                {
                                    Console.WriteLine("Found : " + headedPhrase);
                                    foundInteresting = true;
                                }
                            }
                        }
                    }
                }
                // Shape 2: subjects expressed as plain sub-chains.
                if (structured.Subjects.SubChains != null)
                {
                    foreach (Chain subChain in structured.Subjects.SubChains)
                    {
                        if (subChain.HeadedPhrases == null)
                        {
                            continue;
                        }
                        foreach (HeadedPhrase headedPhrase in subChain.HeadedPhrases)
                        {
                            if (headedPhrase.Modifiers == null || headedPhrase.Modifiers.Count == 0)
                            {
                                continue;
                            }
                            if (headedPhrase.Head.Text == "mi" || headedPhrase.Head.Text == "sina" || headedPhrase.Head.Text == "ona")
                            {
                                Console.WriteLine("Found : " + headedPhrase);
                                foundInteresting = true;
                            }
                        }
                    }
                }
            }
            if (!foundInteresting)
            {
                continue;
            }
            string diag = structured.ToString("b");
            //if ((normalized.ContainsCheck("%ante"))) continue; //verb!
            Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
            Console.WriteLine("B: " + diag);
            //Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
            //}
            //catch (Exception ex)
            //{
            //    if (ex.Message.ContainsCheck("all tests"))
            //    {
            //        Console.WriteLine("ORIGINAL : " + original);
            //        if (structured != null)
            //        {
            //            Console.WriteLine(structured.ToString("b"));
            //        }
            //        Console.WriteLine(ex.Message);
            //        i++;
            //    }
            //    else throw;
            //}
        }
    }
    Console.WriteLine("Failed Sentences: " + i);
}
// Stress test: looks for sentences that contain both the vocative/imperative
// particle "o" and the predicate marker "li" (excluding sentence-initial
// "o ..." forms), parses them, and prints the bracket diagram.
// Unlike most tests in this file the catch is live: failures are counted and
// the final Assert demands zero.
public void StressTestNormalize_VocativeImperatives()
{
    int i = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    //GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            Sentence structured = null;
            try
            {
                string normalized = norm.NormalizeText(original);
                if (string.IsNullOrWhiteSpace(normalized))
                {
                    continue;
                }
                if (!(normalized.ContainsWholeWord("o")))
                {
                    continue;
                }
                if (!(normalized.ContainsWholeWord("li")))
                {
                    continue;
                }
                if ((normalized.StartsWith("o ")))
                {
                    continue; //These seem to be okay
                }
                if (normalized.ContainsCheck("Kinla"))
                {
                    continue; //Has a logical operator in one of the sample sentences that I can't deal with yet, unrelated to kin, ala
                }
                structured = pu.ParsedSentenceFactory(normalized, original);
                string diag = structured.ToString("b");
                Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
                Console.WriteLine("B: " + diag);
                Console.WriteLine("...");
                //Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
            }
            catch (Exception ex)
            {
                Console.WriteLine("FAILED : " + original);
                if (structured != null)
                {
                    Console.WriteLine(structured.ToString("b"));
                }
                Console.WriteLine(ex.Message);
                i++;
            }
        }
    }
    Console.WriteLine("Failed Sentences: " + i);
    Assert.AreEqual(0, i);
}
// Web view-model pipeline: splits the source text into sentences and, for
// each one, produces the normalized form, a round-trip ("g") re-serialization,
// an HTML colorized rendering, a bracket ("b") diagram, and English gloss /
// POS gloss, accumulating each view into its own StringBuilder. Every stage
// catches its own exceptions and records them via UpdateErrors so one bad
// sentence cannot sink the rest. The dialect's gloss language is flipped to
// "en" for the gloss stage and always restored to "tp" in the finally block.
private static void ProcessParserModelSentences(SimpleParserViewModel parse)
{
    Dialect dialect = BindDialect(parse);
    ParserUtils pu = new ParserUtils(dialect);
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);
    string[] sentences = ss.ParseIntoNonNormalizedSentences(parse.SourceText);
    StringBuilder normalizedSb = new StringBuilder();
    StringBuilder spitBackSb = new StringBuilder();
    StringBuilder bracketSb = new StringBuilder();
    StringBuilder posSb = new StringBuilder();
    StringBuilder glossSb = new StringBuilder();
    StringBuilder errors = new StringBuilder();
    StringBuilder colorized = new StringBuilder();
    HtmlFormatter hf = new HtmlFormatter();
    int i = 1; // 1-based line number for display
    foreach (string sentence in sentences)
    {
        string lineNumber = LineNumber(i, true);
        string normalized;
        try
        {
            normalized = norm.NormalizeText(sentence);
        }
        catch (Exception ex)
        {
            // Fall back to the raw sentence so the later stages can still run.
            string error = "[[CANNOT NORMALIZE: " + ex.Message + "]]";
            normalizedSb.AppendLine(error.ToHtml() + "<br/>");
            normalizedSb.AppendLine(hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
            normalized = sentence;
            UpdateErrors(errors, error, sentence);
        }
        //////// TP
        normalizedSb.AppendLine(lineNumber + hf.BoldTheWords(normalized.ToHtml()) + "<br/>");
        try
        {
            Sentence parsedSentence = pu.ParsedSentenceFactory(normalized, sentence);
            //////// TP
            try
            {
                spitBackSb.AppendLine(lineNumber + parsedSentence.ToString("g", dialect).ToHtml() + "<br/>");
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT REPEAT BACK: " + ex.Message + " for " + sentence + "]]";
                spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }
            try
            {
                //string result = parsedSentence.ToString("html", dialect);
                //if (result.Replace("<span", "").Contains("<"))
                //{
                //    throw new InvalidOperationException("No HTML allowed in input");
                //}
                colorized.AppendLine(lineNumber + parsedSentence.ToString("html", dialect) +
                                     "<br/>");
            }
            catch (Exception ex)
            {
                // NOTE(review): colorize failures are appended to spitBackSb,
                // not to colorized — presumably a copy/paste slip; verify intent.
                string error = "[[CANNOT COLORIZE: " + ex.Message + "]]";
                spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                spitBackSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }
            //////// TP
            try
            {
                bracketSb.AppendLine(lineNumber + hf.BoldTheWords(parsedSentence.ToString("b", dialect).ToHtml()) + "<br/>");
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT BRACKET: " + ex.Message + " for " + sentence + "]]";
                bracketSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                bracketSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }
            //////// ENGLISH
            try
            {
                dialect.TargetGloss = "en";
                GlossMaker gm = new GlossMaker();
                string glossed = gm.Gloss(normalized, sentence, "en", false);
                glossSb.AppendLine(lineNumber + glossed.ToHtml() + "<br/>");
                glossed = gm.Gloss(normalized, sentence, "en", true);
                posSb.AppendLine(lineNumber + glossed.ToHtml() + "<br/>");
                //bs doesn't do anything.
            }
            catch (Exception ex)
            {
                string error = "[[CANNOT GLOSS: " + ex.Message.ToHtml() + " for " + sentence.ToHtml() + "]]";
                glossSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                glossSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                posSb.AppendLine(lineNumber + hf.BoldTheWords(error.ToHtml()) + "<br/>");
                posSb.AppendLine(lineNumber + hf.BoldTheWords(sentence.ToHtml()) + "<br/>");
                UpdateErrors(errors, error, sentence);
            }
        }
        catch (Exception ex)
        {
            // Parse itself failed: record the error in every downstream view
            // (normalizedSb already got its line above, so it is excluded).
            string error = "[[CANNOT Parse: " + ex.Message.ToHtml() + "]]";
            foreach (StringBuilder sb in new StringBuilder[]
            {
                //normalizedSb,
                spitBackSb, bracketSb, posSb, glossSb, colorized
            })
            {
                sb.AppendLine(hf.BoldTheWords(error.ToHtml()) + "<br/>");
                sb.Append(sentence.ToHtml() + "<br/>");
            }
            UpdateErrors(errors, error, sentence);
        }
        finally
        {
            dialect.TargetGloss = "tp";
        }
        i++;
    }
    parse.Normalized = normalizedSb.ToString();
    parse.Recovered = spitBackSb.ToString();
    parse.Formatted = bracketSb.ToString();
    parse.FormattedPos = hf.SubThePartsOfSpeech(posSb.ToString());
    parse.Glossed = glossSb.ToString();
    parse.Colorized = colorized.ToString();
    parse.Errors = errors.ToString();
}
// Stress test: finds corpus sentences containing "ala" (negation) or "kin"
// (emphasis), excluding several known-problem shapes (vocatives, "kin la"
// openers, the "Kinla" sample), and prints original / bracket / gloss.
// NOTE(review): the catch is commented out, so `i` stays 0 and the final
// "Failed Sentences" line is always 0; a parse error throws instead.
public void StressTestNormalizeNotIndeedAlaKin()
{
    int i = 0;
    Dialect dialect = Dialect.LooseyGoosey;
    ParserUtils pu = new ParserUtils(dialect);
    Dialect english = Dialect.LooseyGoosey;
    english.TargetGloss = "en";
    english.GlossWithFallBacks = true;
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    GlossMaker gm = new GlossMaker();
    SentenceSplitter ss = new SentenceSplitter(dialect);
    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            if (original.Contains(" su "))
            {
                continue; //neologism, back when we didn't know what pu was and hoped it was something like scandinavian sem
            }
            //try
            //{
            string normalized = norm.NormalizeText(original);
            if (!(normalized.ContainsWholeWord("ala") || normalized.ContainsWholeWord("kin")))
            {
                continue;
            }
            if (normalized.ContainsCheck("Kinla"))
            {
                continue; //Has a logical operator in one of the sample sentences that I can't deal with yet, unrelated to kin, ala
            }
            if (normalized.StartCheck("kin la "))
            {
                continue; //no big deal
            }
            if (normalized.ContainsCheck("pilin pona o"))
            {
                continue; //Not trying to solve vocatives right now
            }
            if (normalized.ContainsCheck(" o, "))
            {
                continue; //Not trying to solve vocatives right now
            }
            Sentence structured = pu.ParsedSentenceFactory(normalized, original);
            string diag = structured.ToString("b");
            //if ((normalized.ContainsCheck("%ante"))) continue; //verb!
            Console.WriteLine("O: " + (original ?? "").Trim(new[] { '\n', '\r', ' ', '\t' }));
            Console.WriteLine("B: " + diag);
            Console.WriteLine("G: " + gm.GlossOneSentence(false, structured, english));
            //}
            //catch (Exception ex)
            //{
            //    if (ex.Message.ContainsCheck("all tests"))
            //    {
            //        Console.WriteLine("ORIGINAL : " + original);
            //        if (structured != null)
            //        {
            //            Console.WriteLine(structured.ToString("b"));
            //        }
            //        Console.WriteLine(ex.Message);
            //        i++;
            //    }
            //    else throw;
            //}
        }
    }
    Console.WriteLine("Failed Sentences: " + i);
}
public void FindProperModifiers()
{
    // Scans the corpus for subject phrases containing proper (capitalized)
    // modifiers and counts how often each distinct phrase occurs, printing
    // the tally sorted by frequency at the end.
    int failures = 0; // parse failures (silently counted)
    Dialect dialect = Dialect.LooseyGoosey;
    dialect.InferCompoundsPrepositionsForeignText = false;
    ParserUtils pu = new ParserUtils(dialect);
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    Dictionary<string, int> words = new Dictionary<string, int>(500);
    SentenceSplitter ss = new SentenceSplitter(dialect);

    foreach (string s in reader.NextFile())
    {
        foreach (string original in ss.ParseIntoNonNormalizedSentences(s))
        {
            try
            {
                string normalized = norm.NormalizeText(original);
                Sentence structured = pu.ParsedSentenceFactory(normalized, original);
                string stringified = structured.Subjects.ToString();

                if (!stringified.Contains(" "))
                {
                    continue; //single word
                }
                if (stringified.Contains(@""""))
                {
                    continue; //foreign
                }
                if (stringified.StartsWith(@"nanpa"))
                {
                    continue; //implicit number
                }
                if (stringified.StartsWith(@"#"))
                {
                    continue; //explicit number by punctuation
                }
                if (stringified.ContainsLetter(Token.AlphabetUpper))
                {
                    // TryGetValue avoids the ContainsKey + indexer double lookup.
                    int count;
                    if (words.TryGetValue(stringified, out count))
                    {
                        words[stringified] = count + 1;
                    }
                    else
                    {
                        words.Add(stringified, 1);
                        // NOTE(review): the number printed here is the running
                        // failure count, not an index — kept for output parity.
                        Console.WriteLine(failures + " : " + stringified);
                    }
                }
            }
            catch (Exception)
            {
                failures++;
            }
        }
    }
    foreach (KeyValuePair<string, int> pair in words.OrderBy(x => x.Value))
    {
        Console.WriteLine(pair.Key + " : " + pair.Value);
    }
}