/// <summary>
/// Attempts to turn a raw segment string into its normalized form, also reporting the
/// left and right environment symbols produced by <c>StripContext</c>.
/// </summary>
/// <param name="segmenter">Segmenter whose <c>NormalizeSegmentString</c> performs the normalization.</param>
/// <param name="segment">Raw segment text; <c>null</c>, empty, "#", "C" and "V" are rejected outright.</param>
/// <param name="normalizedSegment">Normalized text on success ("-" for "-" or "_"); <c>null</c> on failure.</param>
/// <param name="leftEnv">Left environment reported by <c>StripContext</c> on success; <c>null</c> on failure.</param>
/// <param name="rightEnv">Right environment reported by <c>StripContext</c> on success; <c>null</c> on failure.</param>
/// <returns><c>true</c> when the segment could be normalized; otherwise <c>false</c>.</returns>
public static bool Normalize(Segmenter segmenter, string segment, out string normalizedSegment, out FeatureSymbol leftEnv, out FeatureSymbol rightEnv)
{
    // Start from the failure state so every early exit leaves the outputs cleared.
    normalizedSegment = null;
    leftEnv = null;
    rightEnv = null;

    bool isExcluded = string.IsNullOrEmpty(segment) || segment.IsOneOf("#", "C", "V");
    if (isExcluded)
    {
        return false;
    }

    string bare = StripContext(segment, out leftEnv, out rightEnv);

    // Both "-" and "_" collapse to the single representation "-".
    if (bare.IsOneOf("-", "_"))
    {
        normalizedSegment = "-";
        return true;
    }

    // Normalize into a temporary so a failed attempt cannot leak a value to the caller.
    string candidate;
    if (!segmenter.NormalizeSegmentString(bare, out candidate))
    {
        leftEnv = null;
        rightEnv = null;
        return false;
    }

    normalizedSegment = candidate;
    return true;
}
public void SetUp()
{
    // Build the segmenter completely before publishing it to the fixture field.
    var configured = new Segmenter(_spanFactory)
    {
        Consonants =
        {
            "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m",
            "n", "p", "q", "r", "s", "sh", "t", "v", "w", "x", "z"
        },
        Vowels = {"a", "e", "i", "o", "u"},
        Boundaries = {"-"},
        // U+0303 COMBINING TILDE, U+0308 COMBINING DIAERESIS
        Modifiers = {"\u0303", "\u0308"},
        // U+0361 COMBINING DOUBLE INVERTED BREVE
        Joiners = {"\u0361"}
    };

    _segmenter = configured;
}
/// <summary>
/// Creates the mapping set and builds a lookup, keyed by normalized segment in both
/// directions, of the environment pairs under which two segments are mapped to each other.
/// </summary>
/// <param name="segmenter">Segmenter used to normalize each side of every mapping.</param>
/// <param name="mappings">Segment pairs; they are copied into a list. Pairs with a side that fails <c>Normalize</c> are kept in that list but left out of the lookup.</param>
/// <param name="implicitComplexSegments">Stored as-is; not consulted while the lookup is built.</param>
public ListSegmentMappings(Segmenter segmenter, IEnumerable<UnorderedTuple<string, string>> mappings, bool implicitComplexSegments)
{
    _segmenter = segmenter;
    _mappings = mappings.ToList();
    _implicitComplexSegments = implicitComplexSegments;
    _mappingLookup = new Dictionary<string, Dictionary<string, List<Tuple<Environment, Environment>>>>();

    foreach (UnorderedTuple<string, string> pair in _mappings)
    {
        string first;
        FeatureSymbol firstLeft, firstRight;
        if (!Normalize(_segmenter, pair.Item1, out first, out firstLeft, out firstRight))
        {
            continue;
        }

        string second;
        FeatureSymbol secondLeft, secondRight;
        if (!Normalize(_segmenter, pair.Item2, out second, out secondLeft, out secondRight))
        {
            continue;
        }

        var firstEnv = new Environment(firstLeft, firstRight);
        var secondEnv = new Environment(secondLeft, secondRight);

        // Register the pair in both directions so either segment can serve as the lookup key.
        AddDirectedMapping(first, second, firstEnv, secondEnv);
        AddDirectedMapping(second, first, secondEnv, firstEnv);
    }
}

/// <summary>
/// Records one direction of a mapping in the lookup, creating the nested
/// dictionary and context list on first use.
/// </summary>
private void AddDirectedMapping(string from, string to, Environment fromEnv, Environment toEnv)
{
    Dictionary<string, List<Tuple<Environment, Environment>>> targets =
        _mappingLookup.GetValue(from, () => new Dictionary<string, List<Tuple<Environment, Environment>>>());
    List<Tuple<Environment, Environment>> contexts =
        targets.GetValue(to, () => new List<Tuple<Environment, Environment>>());
    contexts.Add(Tuple.Create(fromEnv, toEnv));
}
public void DoubleQuotationsAtTheEndOfASentence026()
{
    // A double-quoted sentence followed by a capitalized word is expected to close the
    // first sentence, with the closing quote kept on it.
    const string text = "She turned to him, \"This is great.\" She held the book out to show him.";
    string[] expected =
    {
        "She turned to him, \"This is great.\"",
        "She held the book out to show him."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void EllipsisAtEndOfQuotation046()
{
    // A spaced four-dot ellipsis just before the closing curly quote must not split the text.
    const string text = "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”";
    string[] expected =
    {
        "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void EllipsisAsSentenceBoundaryStandardEllipsisRules048()
{
    // A spaced ellipsis followed by a terminating period ends the sentence; the
    // following text is expected as a separate sentence.
    const string text = "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.";
    string[] expected =
    {
        "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .",
        "Next sentence."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void GeoCoordinates043()
{
    // The periods inside "N°. 1026.253.553" must not split; only the final one ends the sentence.
    const string text = "You can find it at N°. 1026.253.553. That is where the treasure is.";
    string[] expected =
    {
        "You can find it at N°. 1026.253.553.",
        "That is where the treasure is."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void OneLetterUpperCaseAbbreviations004()
{
    // The middle initial "E." must not be treated as a sentence end.
    const string text = "My name is Jonas E. Smith.";
    string[] expected = { "My name is Jonas E. Smith." };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void AlphabeticalList039()
{
    // Each "a." / "b." / "c." marker is expected to start its own segment.
    const string text = "a. The first item b. The second item c. The third list item";
    string[] expected =
    {
        "a. The first item",
        "b. The second item",
        "c. The third list item"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ErrantNewlinesInTheMiddleOfSentences041()
{
    // A newline after a space in mid-sentence is expected to disappear from the output.
    const string text = "It was a cold \nnight in the city.";
    string[] expected = { "It was a cold night in the city." };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void DoubleQuotationsInsideSentence025()
{
    // A double-quoted sentence followed by a lowercase word stays inside one sentence.
    const string text = "She turned to him, \"This is great.\" she said.";
    string[] expected = { "She turned to him, \"This is great.\" she said." };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void NumberAsSentenceBoundary020()
{
    // The decimal point in "$100.00" must not split, but the period after it does.
    const string text = "She has $100.00. It is in her bag.";
    string[] expected =
    {
        "She has $100.00.",
        "It is in her bag."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void QuestionMarkToEndSentence002()
{
    // A question mark followed by a new capitalized sentence is a boundary.
    const string text = "What is your name? My name is Jonas.";
    string[] expected =
    {
        "What is your name?",
        "My name is Jonas."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void SingleQuotationsInsideSentence024()
{
    // A single-quoted sentence followed by a lowercase word stays inside one sentence.
    const string text = "She turned to him, 'This is great.' she said.";
    string[] expected = { "She turned to him, 'This is great.' she said." };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void WebAddresses023()
{
    // Dots inside the URL must not split; the period that follows the URL ends the sentence.
    const string text = "The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.";
    string[] expected =
    {
        "The site is: https://www.example.50.com/new-site/awesome_content.html.",
        "Please check it out."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void EmailAddresses022()
{
    // The input must contain a real address with dots on both sides of the '@' so the
    // test exercises email handling; only the period after the address ends the sentence.
    const string text = "Her email is Jane.Doe@example.com. I sent her an email.";
    string[] expected =
    {
        "Her email is Jane.Doe@example.com.",
        "I sent her an email."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ParentheticalInsideSentence021()
{
    // A full sentence inside parentheses must not split the surrounding sentence.
    const string text = "He teaches science (He previously worked for 5 years as an engineer.) at the local University.";
    string[] expected =
    {
        "He teaches science (He previously worked for 5 years as an engineer.) at the local University."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ListWithHyphen038()
{
    // Each hyphen-bullet ("⁃") numbered item is expected as its own segment.
    const string text = "⁃9. The first item ⁃10. The second item";
    string[] expected =
    {
        "⁃9. The first item",
        "⁃10. The second item"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void DoublePunctuationExclamationPoint027()
{
    // "!!" is kept together and ends the first sentence.
    const string text = "Hello!! Long time no see.";
    string[] expected =
    {
        "Hello!!",
        "Long time no see."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ErrantNewlinesInTheMiddleOfSentencesPdf040()
{
    // With the PDF document type, a mid-sentence newline is expected to become a single space.
    const string text = "This is a sentence\ncut off in the middle because pdf.";
    string[] expected = { "This is a sentence cut off in the middle because pdf." };

    var actual = Segmenter.Segment(text, documentType: DocumentType.Pdf);

    Assert.Equal(expected, actual);
}
public void DoublePunctuationQuestionMark028()
{
    // "??" is kept together and ends the first sentence.
    const string text = "Hello?? Who is there?";
    string[] expected =
    {
        "Hello??",
        "Who is there?"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void LowerCaseListSeparatedByNewline042()
{
    // Newline-separated lowercase entries are expected as separate segments, with no
    // empty segment for the trailing newline.
    const string text = "features\ncontact manager\nevents, activities\n";
    string[] expected =
    {
        "features",
        "contact manager",
        "events, activities"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void DoublePunctuationExclamationPointQuestionMark029()
{
    // "!?" is kept together and ends the first sentence.
    const string text = "Hello!? Is that you?";
    string[] expected =
    {
        "Hello!?",
        "Is that you?"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void NamedEntitiesWithAnExclamationPoint044()
{
    // The "!" in "Yahoo!" followed by a lowercase word must not end the sentence.
    const string text = "She works at Yahoo! in the accounting department.";
    string[] expected = { "She works at Yahoo! in the accounting department." };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void DoublePunctuationQuestionMarkExclamationPoint030()
{
    // "?!" is kept together and ends the first sentence.
    const string text = "Hello?! Is that you?";
    string[] expected =
    {
        "Hello?!",
        "Is that you?"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void IasASentenceBoundaryAndIAsAnAbbreviation045()
{
    // "I." ends the first sentence (pronoun) but is kept inside the second (middle initial).
    const string text = "We make a good team, you and I. Did you see Albert I. Jones yesterday?";
    string[] expected =
    {
        "We make a good team, you and I.",
        "Did you see Albert I. Jones yesterday?"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ListPeriodFollowedByParensAndPeriodToEndItem032()
{
    // "1.)" / "2.)" markers stay attached to their items; each item's period ends it.
    const string text = "1.) The first item. 2.) The second item.";
    string[] expected =
    {
        "1.) The first item.",
        "2.) The second item."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void EllipsisWithSquareBrackets047()
{
    // Bracketed ellipses "[...]" and the trailing citation stay in a single sentence.
    const string text = "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).";
    string[] expected =
    {
        "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ListParensAndPeriodToEndItem034()
{
    // "1)" / "2)" markers stay attached to their items; each item's period ends it.
    const string text = "1) The first item. 2) The second item.";
    string[] expected =
    {
        "1) The first item.",
        "2) The second item."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ExclamationMarkToEndSentence003()
{
    // An exclamation mark followed by a new capitalized sentence is a boundary.
    const string text = "There it is! I found it.";
    string[] expected =
    {
        "There it is!",
        "I found it."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void ListPeriodToMarkListAndPeriodToEndItem036()
{
    // "1." / "2." markers stay attached to their items; each item's period ends it.
    const string text = "1. The first item. 2. The second item.";
    string[] expected =
    {
        "1. The first item.",
        "2. The second item."
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
/// <summary>
/// Reports whether <c>Normalize</c> accepts the given segment; the normalized
/// text and environment symbols it produces are discarded.
/// </summary>
/// <param name="segmenter">Segmenter forwarded to <c>Normalize</c>.</param>
/// <param name="segment">Raw segment text to check.</param>
/// <returns>The result of <c>Normalize</c> for this segment.</returns>
public static bool IsValid(Segmenter segmenter, string segment)
{
    // Only the success flag matters here; the out values are throwaways.
    string unusedNormalized;
    FeatureSymbol unusedLeft;
    FeatureSymbol unusedRight;

    bool accepted = Normalize(segmenter, segment, out unusedNormalized, out unusedLeft, out unusedRight);
    return accepted;
}
public void ListWithBullet037()
{
    // Each bullet ("•") numbered item is expected as its own segment.
    const string text = "• 9. The first item • 10. The second item";
    string[] expected =
    {
        "• 9. The first item",
        "• 10. The second item"
    };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}
public void SetUp()
{
    // Feature inventory shared by every segment definition below.
    _featSys = new FeatureSystem
    {
        new SymbolicFeature("place",
            new FeatureSymbol("bilabial"),
            new FeatureSymbol("labiodental"),
            new FeatureSymbol("dental"),
            new FeatureSymbol("alveolar"),
            new FeatureSymbol("retroflex"),
            new FeatureSymbol("palato-alveolar"),
            new FeatureSymbol("palatal"),
            new FeatureSymbol("velar"),
            new FeatureSymbol("uvular"),
            new FeatureSymbol("pharyngeal"),
            new FeatureSymbol("glottal")),
        new SymbolicFeature("manner",
            new FeatureSymbol("stop"),
            new FeatureSymbol("affricate"),
            new FeatureSymbol("fricative"),
            new FeatureSymbol("approximant"),
            new FeatureSymbol("trill"),
            new FeatureSymbol("flap"),
            new FeatureSymbol("close-vowel"),
            new FeatureSymbol("mid-vowel"),
            new FeatureSymbol("open-vowel")),
        new SymbolicFeature("voice",
            new FeatureSymbol("voice+"),
            new FeatureSymbol("voice-")),
        new SymbolicFeature("height",
            new FeatureSymbol("close"),
            new FeatureSymbol("near-close"),
            new FeatureSymbol("close-mid"),
            new FeatureSymbol("mid"),
            new FeatureSymbol("open-mid"),
            new FeatureSymbol("near-open"),
            new FeatureSymbol("open")),
        new SymbolicFeature("backness",
            new FeatureSymbol("front"),
            new FeatureSymbol("near-front"),
            new FeatureSymbol("central"),
            new FeatureSymbol("near-back"),
            new FeatureSymbol("back")),
        new SymbolicFeature("round",
            new FeatureSymbol("round+"),
            new FeatureSymbol("round-"))
    };

    _segmentPool = new SegmentPool();

    // Segmenter that knows only the symbols needed for the words "car" and "bar".
    _segmenter = new Segmenter(_spanFactory)
    {
        Consonants =
        {
            {"c", FeatureStruct.New(_featSys).Symbol("palatal").Symbol("stop").Symbol("voice-").Value},
            {"b", FeatureStruct.New(_featSys).Symbol("bilabial").Symbol("stop").Symbol("voice+").Value},
            {"r", FeatureStruct.New(_featSys).Symbol("alveolar").Symbol("trill").Symbol("voice+").Value}
        },
        Vowels =
        {
            {"a", FeatureStruct.New(_featSys).Symbol("open").Symbol("front").Symbol("round-").Symbol("open-vowel").Symbol("voice+").Value}
        },
        Boundaries = {"-"},
        Modifiers = {"\u0303", "\u0308"},
        Joiners = {"\u0361"}
    };

    var syllabifier = new SimpleSyllabifier(false, false);
    var sharedMeaning = new Meaning("test", null);

    // First variety: one word, segmented before it is added, then syllabified.
    var firstVariety = new Variety("variety1");
    _word1 = new Word("car", sharedMeaning);
    _segmenter.Segment(_word1);
    firstVariety.Words.Add(_word1);
    syllabifier.Process(firstVariety);

    // Second variety: same steps for the counterpart word.
    var secondVariety = new Variety("variety2");
    _word2 = new Word("bar", sharedMeaning);
    _segmenter.Segment(_word2);
    secondVariety.Words.Add(_word2);
    syllabifier.Process(secondVariety);

    var pair = new VarietyPair(firstVariety, secondVariety);
    pair.SoundChangeFrequencyDistribution = new ConditionalFrequencyDistribution<SoundContext, Ngram<Segment>>();

    // Count one correspondence: the first segment of word 1 (as a sound context with
    // no sound classes) against the first segment of word 2.
    var leadingContext = _word1.Shape.First.ToSoundContext(_segmentPool, Enumerable.Empty<SoundClass>());
    var leadingCorrespondence = _segmentPool.Get(_word2.Shape.First);
    pair.SoundChangeFrequencyDistribution[leadingContext].Increment(leadingCorrespondence);

    pair.SoundChangeProbabilityDistribution = new ConditionalProbabilityDistribution<SoundContext, Ngram<Segment>>(
        pair.SoundChangeFrequencyDistribution,
        (context, frequencies) => new MaxLikelihoodProbabilityDistribution<Ngram<Segment>>(frequencies));

    // Register the pair with both varieties.
    firstVariety.VarietyPairs.VarietyPairAdded(pair);
    secondVariety.VarietyPairs.VarietyPairAdded(pair);
}
public void NumberAsNonSentenceBoundary019()
{
    // The decimal point in "$100.00" must not be treated as a sentence end.
    const string text = "She has $100.00 in her bag.";
    string[] expected = { "She has $100.00 in her bag." };

    var actual = Segmenter.Segment(text);

    Assert.Equal(expected, actual);
}