public void WordsNumbersPunctuationWithoutDash()
{
    // A pure digit run (no dash) should come back as a single token.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWordsNumbersPunctuation("1231231");
    Assert.AreEqual(1, tokens.Length);
}
public void DoubleBadProperModifer()
{
    // "jan MaliyA" carries a malformed proper modifier; ValidWords is
    // expected to throw. If it parses cleanly, the test fails.
    const string text = "jan MaliyA";
    Normalizer normalizer = new Normalizer(Dialect.LooseyGoosey);
    Console.WriteLine(normalizer.NormalizeText(text));

    TokenParserUtils parser = new TokenParserUtils();
    Word[] parsed;
    try
    {
        parsed = parser.ValidWords(text);
    }
    catch (Exception)
    {
        // Expected path: the bad modifier was rejected.
        return;
    }
    foreach (Word word in parsed)
    {
        Console.WriteLine(word);
    }
    Assert.Fail();
}
public void ParseEachWordInDictionary2()
{
    // Every dictionary entry must tokenize back to exactly itself.
    // All offenders are printed before the single failure at the end,
    // so one run shows the full list.
    TokenParserUtils parser = new TokenParserUtils();
    bool anyFailed = false;
    foreach (Word entry in Words.Dictionary.Values)
    {
        string[] tokens = parser.JustTpWordsNumbersPunctuation(entry.Text);
        if (tokens.Length == 0)
        {
            anyFailed = true;
            Console.WriteLine(entry.Text);
        }
        if (tokens.Length == 1 && tokens[0] != entry.Text)
        {
            anyFailed = true;
            Console.WriteLine(entry.Text);
        }
    }
    if (anyFailed)
    {
        Assert.Fail("Something failed to parse");
    }
}
public void ThreWords()
{
    // Three space-separated words yield three tokens.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWords("jan lili lon");
    Assert.AreEqual(3, tokens.Length);
}
public void OneWord()
{
    // A single word yields exactly one token.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWords("jan");
    Assert.AreEqual(1, tokens.Length);
}
public void SpellCheck_ForStressTest_UsingOnlyWordConstructor()
{
    // Stress test: split every corpus file into sentences, normalize,
    // tokenize, trim edge punctuation, and verify each remaining token can
    // construct a Word. Tokens that fail more than 10 times are reported.
    CorpusFileReader reader = new CorpusFileReader();

    // FIX: failure counts were stored as strings in a
    // Dictionary<string, string> and round-tripped through Convert.ToInt32
    // on every increment; store them as ints directly.
    Dictionary<string, int> bad = new Dictionary<string, int>();

    // NOTE(review): nothing is ever added to this set, so the Contains
    // check below never skips anything — behavior preserved from the
    // original, but a HashSet makes the intended membership test O(1).
    HashSet<string> good = new HashSet<string>();

    TokenParserUtils tpu = new TokenParserUtils();
    Dialect dialect = Dialect.LooseyGoosey;
    SentenceSplitter ss = new SentenceSplitter(dialect);
    Normalizer norm = new Normalizer(dialect);

    // Characters stripped from token edges. Double quotes are kept on
    // purpose: we can't ID some marked foreign text without them.
    char[] trimChars = { ':', '.', '\'', '«', '»', '!', '?', '-', '[', ']' };

    foreach (string s in reader.NextFile())
    {
        string[] rawSentences = ss.ParseIntoNonNormalizedSentences(s);
        foreach (string sentence in rawSentences)
        {
            string normalized = norm.NormalizeText(sentence); // Normalization improved stuff
            string[] words = tpu.JustTokens(normalized);
            for (int index = 0; index < words.Length; index++)
            {
                words[index] = words[index].Trim(trimChars);
            }
            foreach (string word in words.Where(x => !string.IsNullOrEmpty(x)))
            {
                if (good.Contains(word))
                {
                    continue;
                }
                try
                {
                    Word w = new Word(word);
                }
                catch (Exception)
                {
                    // FIX: single TryGetValue instead of ContainsKey +
                    // indexer (double lookup); unused 'ex' removed.
                    bad.TryGetValue(word, out int count);
                    bad[word] = count + 1;
                }
            }
        }
    }

    // Report only tokens that failed frequently enough to matter.
    foreach (KeyValuePair<string, int> pair in bad)
    {
        if (pair.Value > 10)
        {
            Console.WriteLine("Uh-oh: " + pair.Key + " " + pair.Value);
        }
    }
}
public void ParseLiAla()
{
    // "li ala" splits into its two constituent particles, in order.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWordsNumbersPunctuation("li ala");

    Assert.AreEqual(2, tokens.Length);
    Assert.AreEqual("li", tokens[0]);
    Assert.AreEqual("ala", tokens[1]);
}
public void GoDawgs()
{
    // The quoted foreign text "Go*Dawgs!" should stay one token.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWords("\"Go*Dawgs!\"");
    Assert.AreEqual(1, tokens.Length);
}
public void ParseEachWordInDictionary()
{
    // Each dictionary word must parse to exactly one token equal to itself.
    TokenParserUtils parser = new TokenParserUtils();
    foreach (Word entry in Words.Dictionary.Values)
    {
        string[] tokens = parser.JustTpWords(entry.Text);
        Assert.AreEqual(1, tokens.Length);
        Assert.AreEqual(entry.Text, tokens[0]);
    }
}
public void WordsNumbersWithDash()
{
    // A dashed number pair should remain a single token.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWords("123-1231");
    foreach (string token in tokens)
    {
        Console.WriteLine(token);
    }
    Assert.AreEqual(1, tokens.Length);
}
public void CompoundWordParsePrepPhraseAsCompound()
{
    // A dashed prep-phrase compound must remerge into one token that
    // round-trips to the original text.
    const string compound = "lon-ma-pi-ike-ale";
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.RemergeCompounds(parser.JustTpWordsNumbersPunctuation(compound));
    foreach (string token in tokens)
    {
        Console.WriteLine(token);
    }
    Assert.AreEqual(1, tokens.Length);
    Assert.AreEqual(compound, tokens[0]);
}
public void CompoundWordParseSimple()
{
    // A simple dashed compound must remerge into one token that
    // round-trips to the original text.
    const string compound = "tomo-tawa-kon";
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.RemergeCompounds(parser.JustTpWordsNumbersPunctuation(compound));
    foreach (string token in tokens)
    {
        Console.WriteLine(token);
    }
    Assert.AreEqual(1, tokens.Length);
    Assert.AreEqual(compound, tokens[0]);
}
public void WordsNumbersPunctuationWithDash()
{
    // A dashed number pair stays one token and round-trips unchanged.
    const string value = "123-1231";
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWordsNumbersPunctuation(value);
    foreach (string token in tokens)
    {
        Console.WriteLine(token);
    }
    Assert.AreEqual(1, tokens.Length);
    Assert.AreEqual(value, tokens[0]);
}
public void ShouldBeGoodKunpapa()
{
    // "jan Kunpapa" is a well-formed proper modifier; ValidWords must not
    // throw (the test passes by completing without an exception).
    const string text = "jan Kunpapa";
    Normalizer normalizer = new Normalizer(Dialect.LooseyGoosey);
    Console.WriteLine(normalizer.NormalizeText(text));
    TokenParserUtils parser = new TokenParserUtils();
    foreach (Word word in parser.ValidWords(text))
    {
        Console.WriteLine(word);
    }
}
public void SpellCheck()
{
    // Tokenize a corpus text and report any token the Word constructor
    // rejects; failures are logged rather than asserted.
    TokenParserUtils parser = new TokenParserUtils();
    string[] tokens = parser.JustTpWordsNumbersPunctuation(CorpusTexts.JanSin);
    foreach (string token in tokens)
    {
        try
        {
            Word parsed = new Word(token);
        }
        catch (Exception)
        {
            Console.WriteLine("Uh-oh: " + token);
        }
    }
}