/// <summary>
/// Handles a sentence that looks mostly foreign (non toki pona): wraps the whole
/// sentence in double quotes as a single token, unless enough of it already parses
/// as toki pona, in which case foreign words are quoted individually instead.
/// </summary>
/// <param name="sentence">The raw sentence to inspect and possibly quote.</param>
/// <param name="dialect">Dialect settings forwarded to the per-word detector.</param>
/// <returns>The sentence, either whole-quoted or with individual foreign words quoted.</returns>
/// <exception cref="NormalizationException">
/// Thrown if whole-sentence quoting ever produces a quote-dot-quote tail (sanity check).
/// </exception>
public static string DetectEntireForeignSentence(string sentence, Dialect dialect)
{
    decimal tokiPonaRatio = PercentTokiPona(sentence);

    // Enough toki pona content: quote on a per-word basis instead of wholesale.
    if (tokiPonaRatio >= 0.20m)
    {
        return NormalizeForeignText.DetectIndividualForeignWords(sentence, dialect);
    }

    // Count existing double-quote characters; only whole-quote when there are none.
    int quoteCount = sentence.Split('"').Length - 1;
    if (quoteCount == 0)
    {
        // Quote the whole thing as one foreign token.
        if (sentence.ContainsCheck(" "))
        {
            // Bind the words together so the phrase survives later tokenizing.
            sentence = sentence.Replace(" ", "*");
        }

        if (!sentence.StartCheck(@""""))
        {
            sentence = @"""" + sentence;
        }

        if (!sentence.EndCheck(@"""") && !sentence.EndCheck("\"."))
        {
            sentence = sentence + @"""";
        }

        // Sanity check: the steps above should never yield a quote-dot-quote tail.
        if (sentence.EndsWith("\".\""))
        {
            Console.WriteLine(sentence);
            throw new NormalizationException("Ends with " + "\".\"");
        }
    }

    return sentence;
    //if 25% or less tp, this is mostly foreign text.
    //if 75% or more tp, this is tp text with errors.
    //return "";
}
/// <summary>
/// Runs the normalizer over every sentence of every corpus file, printing the
/// toki pona percentage for each normalized sentence and a final sentence count.
/// </summary>
public void NormalizeAllTextFiles()
{
    Dialect dialect = Dialect.LooseyGoosey;
    Normalizer norm = new Normalizer(dialect);
    CorpusFileReader reader = new CorpusFileReader();
    SentenceSplitter splitter = new SentenceSplitter(dialect);

    int sentenceCount = 0;
    foreach (string fileText in reader.NextFile())
    {
        foreach (string rawSentence in splitter.ParseIntoNonNormalizedSentences(fileText))
        {
            string normalized = norm.NormalizeText(rawSentence);

            // Report how much of the normalized result registers as toki pona.
            decimal percent = NormalizeForeignText.PercentTokiPona(normalized);
            Console.WriteLine(percent + "%");
            sentenceCount++;
        }
    }

    Console.WriteLine("Sentences normalized: " + sentenceCount);
}