Exemplo n.º 1
0
        /// <summary>
        /// Tries to reduce a mapping segment string to its normalized form, splitting off
        /// any left/right environment markers along the way.
        /// </summary>
        /// <param name="segmenter">Segmenter used to normalize the stripped segment string.</param>
        /// <param name="segment">Raw segment string, possibly carrying environment context.</param>
        /// <param name="normalizedSegment">The normalized segment ("-" for "-" or "_"), or null on failure.</param>
        /// <param name="leftEnv">Left environment reported by StripContext, or null on failure.</param>
        /// <param name="rightEnv">Right environment reported by StripContext, or null on failure.</param>
        /// <returns>true when the segment could be normalized; otherwise false.</returns>
        public static bool Normalize(Segmenter segmenter, string segment, out string normalizedSegment, out FeatureSymbol leftEnv, out FeatureSymbol rightEnv)
        {
            normalizedSegment = null;

            // Empty input and the bare "#", "C", "V" symbols are rejected outright.
            bool isRejectedSymbol = string.IsNullOrEmpty(segment) || segment.IsOneOf("#", "C", "V");
            if (!isRejectedSymbol)
            {
                string stripped = StripContext(segment, out leftEnv, out rightEnv);

                // "-" and "_" both collapse to "-"; the segmenter is not consulted for them.
                bool isGap = stripped.IsOneOf("-", "_");
                string canonical = null;
                if (isGap || segmenter.NormalizeSegmentString(stripped, out canonical))
                {
                    normalizedSegment = isGap ? "-" : canonical;
                    return true;
                }
            }

            // Failure: make sure no environment from StripContext leaks out.
            leftEnv = null;
            rightEnv = null;
            return false;
        }
Exemplo n.º 2
0
 /// <summary>
 /// Creates the segmenter fixture with a small consonant/vowel inventory, one boundary,
 /// two modifiers and one joiner, and stores it in <c>_segmenter</c>.
 /// </summary>
 public void SetUp()
 {
     string[] consonants = {"b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "sh", "t", "v", "w", "x", "z"};
     string[] vowels = {"a", "e", "i", "o", "u"};

     var segmenter = new Segmenter(_spanFactory);

     foreach (string consonant in consonants)
     {
         segmenter.Consonants.Add(consonant);
     }

     foreach (string vowel in vowels)
     {
         segmenter.Vowels.Add(vowel);
     }

     segmenter.Boundaries.Add("-");
     segmenter.Modifiers.Add("\u0303");
     segmenter.Modifiers.Add("\u0308");
     segmenter.Joiners.Add("\u0361");

     // Publish the fixture only once it is fully populated.
     _segmenter = segmenter;
 }
Exemplo n.º 3
0
        /// <summary>
        /// Stores the given segment mappings and builds a two-way lookup from them.
        /// </summary>
        /// <remarks>
        /// Every mapping whose two items both pass Normalize is registered in both
        /// directions (item1 -> item2 and item2 -> item1), each entry carrying the pair
        /// of environments in the matching order. Mappings where either item fails to
        /// normalize are kept in the stored list but are not added to the lookup.
        /// </remarks>
        /// <param name="segmenter">Segmenter used to normalize the mapped segment strings.</param>
        /// <param name="mappings">Segment pairs to register; copied into a list.</param>
        /// <param name="implicitComplexSegments">Flag stored as-is on the instance.</param>
        public ListSegmentMappings(Segmenter segmenter, IEnumerable<UnorderedTuple<string, string>> mappings, bool implicitComplexSegments)
        {
            _segmenter = segmenter;
            _mappings = mappings.ToList();
            _implicitComplexSegments = implicitComplexSegments;
            _mappingLookup = new Dictionary<string, Dictionary<string, List<Tuple<Environment, Environment>>>>();

            foreach (UnorderedTuple<string, string> pair in _mappings)
            {
                string first, second;
                FeatureSymbol firstLeft, firstRight, secondLeft, secondRight;

                // Skip pairs where either side cannot be normalized.
                if (!Normalize(_segmenter, pair.Item1, out first, out firstLeft, out firstRight)
                    || !Normalize(_segmenter, pair.Item2, out second, out secondLeft, out secondRight))
                {
                    continue;
                }

                var firstEnv = new Environment(firstLeft, firstRight);
                var secondEnv = new Environment(secondLeft, secondRight);

                // Direction: first -> second.
                var forward = _mappingLookup.GetValue(first, () => new Dictionary<string, List<Tuple<Environment, Environment>>>());
                forward.GetValue(second, () => new List<Tuple<Environment, Environment>>())
                    .Add(Tuple.Create(firstEnv, secondEnv));

                // Direction: second -> first, with the environments swapped to match.
                var backward = _mappingLookup.GetValue(second, () => new Dictionary<string, List<Tuple<Environment, Environment>>>());
                backward.GetValue(first, () => new List<Tuple<Environment, Environment>>())
                    .Add(Tuple.Create(secondEnv, firstEnv));
            }
        }
        /// <summary>
        /// A double-quoted sentence followed by a capitalized sentence is expected to split after the closing quote.
        /// </summary>
        public void DoubleQuotationsAtTheEndOfASentence026()
        {
            const string text = "She turned to him, \"This is great.\" She held the book out to show him.";
            string[] expected = { "She turned to him, \"This is great.\"", "She held the book out to show him." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A spaced ellipsis that closes a quotation is expected to stay inside a single sentence.
        /// </summary>
        public void EllipsisAtEndOfQuotation046()
        {
            const string text = "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”";
            string[] expected = { "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A four-dot spaced ellipsis followed by a new sentence is expected to act as a sentence boundary.
        /// </summary>
        public void EllipsisAsSentenceBoundaryStandardEllipsisRules048()
        {
            const string text = "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.";
            string[] expected =
            {
                "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .",
                "Next sentence."
            };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// The periods inside "N°. 1026.253.553." are expected not to split the sentence; only the final one does.
        /// </summary>
        public void GeoCoordinates043()
        {
            const string text = "You can find it at N°. 1026.253.553. That is where the treasure is.";
            string[] expected = { "You can find it at N°. 1026.253.553.", "That is where the treasure is." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A single upper-case initial followed by a period is expected not to end the sentence.
        /// </summary>
        public void OneLetterUpperCaseAbbreviations004()
        {
            const string text = "My name is Jonas E. Smith.";
            string[] expected = { "My name is Jonas E. Smith." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// Items of an inline "a. b. c." list are expected to come back as separate segments.
        /// </summary>
        public void AlphabeticalList039()
        {
            const string text = "a. The first item b. The second item c. The third list item";
            string[] expected = { "a. The first item", "b. The second item", "c. The third list item" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A stray newline after a space in mid-sentence is expected to be removed rather than split the sentence.
        /// </summary>
        public void ErrantNewlinesInTheMiddleOfSentences041()
        {
            const string text = "It was a cold \nnight in the city.";
            string[] expected = { "It was a cold night in the city." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A double-quoted sentence followed by lower-case text is expected to remain one sentence.
        /// </summary>
        public void DoubleQuotationsInsideSentence025()
        {
            const string text = "She turned to him, \"This is great.\" she said.";
            string[] expected = { "She turned to him, \"This is great.\" she said." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A decimal amount at the end of a sentence is expected to split only at the final period.
        /// </summary>
        public void NumberAsSentenceBoundary020()
        {
            const string text = "She has $100.00. It is in her bag.";
            string[] expected = { "She has $100.00.", "It is in her bag." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A question mark is expected to end the first sentence.
        /// </summary>
        public void QuestionMarkToEndSentence002()
        {
            const string text = "What is your name? My name is Jonas.";
            string[] expected = { "What is your name?", "My name is Jonas." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A single-quoted sentence followed by lower-case text is expected to remain one sentence.
        /// </summary>
        public void SingleQuotationsInsideSentence024()
        {
            const string text = "She turned to him, 'This is great.' she said.";
            string[] expected = { "She turned to him, 'This is great.' she said." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// Periods inside a URL are expected not to split the sentence; the period after the URL does.
        /// </summary>
        public void WebAddresses023()
        {
            const string text = "The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.";
            string[] expected = { "The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// The sentence containing an e-mail address is expected to split only at its final period.
        /// </summary>
        public void EmailAddresses022()
        {
            // NOTE(review): "[email protected]" looks like an e-mail obfuscation placeholder rather than
            // a real address — confirm the intended literal against the upstream test data.
            const string text = "Her email is [email protected]. I sent her an email.";
            string[] expected = { "Her email is [email protected].", "I sent her an email." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A full sentence inside parentheses is expected not to split the surrounding sentence.
        /// </summary>
        public void ParentheticalInsideSentence021()
        {
            const string text = "He teaches science (He previously worked for 5 years as an engineer.) at the local University.";
            string[] expected = { "He teaches science (He previously worked for 5 years as an engineer.) at the local University." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// Numbered items introduced by a hyphen bullet are expected to come back as separate segments.
        /// </summary>
        public void ListWithHyphen038()
        {
            const string text = "⁃9. The first item ⁃10. The second item";
            string[] expected = { "⁃9. The first item", "⁃10. The second item" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A doubled exclamation point is expected to stay together and end the first sentence.
        /// </summary>
        public void DoublePunctuationExclamationPoint027()
        {
            const string text = "Hello!! Long time no see.";
            string[] expected = { "Hello!!", "Long time no see." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// With the PDF document type, a newline in mid-sentence is expected to be replaced by a space.
        /// </summary>
        public void ErrantNewlinesInTheMiddleOfSentencesPdf040()
        {
            const string text = "This is a sentence\ncut off in the middle because pdf.";
            string[] expected = { "This is a sentence cut off in the middle because pdf." };

            var sentences = Segmenter.Segment(text, documentType: DocumentType.Pdf);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// A doubled question mark is expected to stay together and end the first sentence.
        /// </summary>
        public void DoublePunctuationQuestionMark028()
        {
            const string text = "Hello?? Who is there?";
            string[] expected = { "Hello??", "Who is there?" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// Lower-case lines separated by newlines are expected to come back as one segment per line.
        /// </summary>
        public void LowerCaseListSeparatedByNewline042()
        {
            const string text = "features\ncontact manager\nevents, activities\n";
            string[] expected = { "features", "contact manager", "events, activities" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// The "!?" pair is expected to stay together and end the first sentence.
        /// </summary>
        public void DoublePunctuationExclamationPointQuestionMark029()
        {
            const string text = "Hello!? Is that you?";
            string[] expected = { "Hello!?", "Is that you?" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// The exclamation point in "Yahoo!" followed by lower-case text is expected not to split the sentence.
        /// </summary>
        public void NamedEntitiesWithAnExclamationPoint044()
        {
            const string text = "She works at Yahoo! in the accounting department.";
            string[] expected = { "She works at Yahoo! in the accounting department." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// The "?!" pair is expected to stay together and end the first sentence.
        /// </summary>
        public void DoublePunctuationQuestionMarkExclamationPoint030()
        {
            const string text = "Hello?! Is that you?";
            string[] expected = { "Hello?!", "Is that you?" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// "I." is expected to end the first sentence as a pronoun but not to split "Albert I. Jones" as an initial.
        /// </summary>
        public void IasASentenceBoundaryAndIAsAnAbbreviation045()
        {
            const string text = "We make a good team, you and I. Did you see Albert I. Jones yesterday?";
            string[] expected = { "We make a good team, you and I.", "Did you see Albert I. Jones yesterday?" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// List markers of the form "1.)" are expected to stay attached to their item, with each item a separate segment.
        /// </summary>
        public void ListPeriodFollowedByParensAndPeriodToEndItem032()
        {
            const string text = "1.) The first item. 2.) The second item.";
            string[] expected = { "1.) The first item.", "2.) The second item." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// Bracketed ellipses "[...]" inside a quotation are expected not to split the sentence.
        /// </summary>
        public void EllipsisWithSquareBrackets047()
        {
            const string text = "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).";
            string[] expected = { "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// List markers of the form "1)" are expected to stay attached to their item, with each item a separate segment.
        /// </summary>
        public void ListParensAndPeriodToEndItem034()
        {
            const string text = "1) The first item. 2) The second item.";
            string[] expected = { "1) The first item.", "2) The second item." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// An exclamation mark is expected to end the first sentence.
        /// </summary>
        public void ExclamationMarkToEndSentence003()
        {
            const string text = "There it is! I found it.";
            string[] expected = { "There it is!", "I found it." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
        /// <summary>
        /// List markers of the form "1." are expected to stay attached to their item, with each item a separate segment.
        /// </summary>
        public void ListPeriodToMarkListAndPeriodToEndItem036()
        {
            const string text = "1. The first item. 2. The second item.";
            string[] expected = { "1. The first item.", "2. The second item." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
Exemplo n.º 32
0
 /// <summary>
 /// Reports whether <paramref name="segment"/> can be normalized by Normalize.
 /// </summary>
 /// <param name="segmenter">Segmenter passed through to Normalize.</param>
 /// <param name="segment">Segment string to check.</param>
 /// <returns>The result of Normalize; its out values are discarded.</returns>
 public static bool IsValid(Segmenter segmenter, string segment)
 {
     // Only the success flag matters here; the out values are throwaways.
     string unusedNormalized;
     FeatureSymbol unusedLeft;
     FeatureSymbol unusedRight;

     bool canNormalize = Normalize(segmenter, segment, out unusedNormalized, out unusedLeft, out unusedRight);
     return canNormalize;
 }
        /// <summary>
        /// Numbered items introduced by a bullet character are expected to come back as separate segments.
        /// </summary>
        public void ListWithBullet037()
        {
            const string text = "• 9. The first item • 10. The second item";
            string[] expected = { "• 9. The first item", "• 10. The second item" };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }
Exemplo n.º 34
0
        /// <summary>
        /// Builds the shared fixture: a feature system, a segment pool, a segmenter with a
        /// four-symbol inventory, two single-word varieties ("car" and "bar"), and a variety
        /// pair whose sound-change distributions hold one observation linking the first
        /// segment of each word.
        /// </summary>
        public void SetUp()
        {
            // --- Feature system -------------------------------------------------
            _featSys = new FeatureSystem
            {
                new SymbolicFeature("place",
                    new FeatureSymbol("bilabial"),
                    new FeatureSymbol("labiodental"),
                    new FeatureSymbol("dental"),
                    new FeatureSymbol("alveolar"),
                    new FeatureSymbol("retroflex"),
                    new FeatureSymbol("palato-alveolar"),
                    new FeatureSymbol("palatal"),
                    new FeatureSymbol("velar"),
                    new FeatureSymbol("uvular"),
                    new FeatureSymbol("pharyngeal"),
                    new FeatureSymbol("glottal")),
                new SymbolicFeature("manner",
                    new FeatureSymbol("stop"),
                    new FeatureSymbol("affricate"),
                    new FeatureSymbol("fricative"),
                    new FeatureSymbol("approximant"),
                    new FeatureSymbol("trill"),
                    new FeatureSymbol("flap"),
                    new FeatureSymbol("close-vowel"),
                    new FeatureSymbol("mid-vowel"),
                    new FeatureSymbol("open-vowel")),
                new SymbolicFeature("voice",
                    new FeatureSymbol("voice+"),
                    new FeatureSymbol("voice-")),
                new SymbolicFeature("height",
                    new FeatureSymbol("close"),
                    new FeatureSymbol("near-close"),
                    new FeatureSymbol("close-mid"),
                    new FeatureSymbol("mid"),
                    new FeatureSymbol("open-mid"),
                    new FeatureSymbol("near-open"),
                    new FeatureSymbol("open")),
                new SymbolicFeature("backness",
                    new FeatureSymbol("front"),
                    new FeatureSymbol("near-front"),
                    new FeatureSymbol("central"),
                    new FeatureSymbol("near-back"),
                    new FeatureSymbol("back")),
                new SymbolicFeature("round",
                    new FeatureSymbol("round+"),
                    new FeatureSymbol("round-"))
            };

            // --- Segment pool and segmenter -------------------------------------
            _segmentPool = new SegmentPool();
            _segmenter = new Segmenter(_spanFactory)
            {
                Consonants =
                {
                    {"c", FeatureStruct.New(_featSys).Symbol("palatal").Symbol("stop").Symbol("voice-").Value},
                    {"b", FeatureStruct.New(_featSys).Symbol("bilabial").Symbol("stop").Symbol("voice+").Value},
                    {"r", FeatureStruct.New(_featSys).Symbol("alveolar").Symbol("trill").Symbol("voice+").Value}
                },
                Vowels =
                {
                    {"a", FeatureStruct.New(_featSys).Symbol("open").Symbol("front").Symbol("round-").Symbol("open-vowel").Symbol("voice+").Value}
                },
                Boundaries = {"-"},
                Modifiers = {"\u0303", "\u0308"},
                Joiners = {"\u0361"}
            };

            // --- Two varieties, one word each, sharing one meaning ---------------
            var wordSyllabifier = new SimpleSyllabifier(false, false);
            var sharedMeaning = new Meaning("test", null);

            var firstVariety = new Variety("variety1");
            _word1 = new Word("car", sharedMeaning);
            _segmenter.Segment(_word1);
            firstVariety.Words.Add(_word1);
            wordSyllabifier.Process(firstVariety);

            var secondVariety = new Variety("variety2");
            _word2 = new Word("bar", sharedMeaning);
            _segmenter.Segment(_word2);
            secondVariety.Words.Add(_word2);
            wordSyllabifier.Process(secondVariety);

            // --- Variety pair with a single sound-change observation -------------
            var pair = new VarietyPair(firstVariety, secondVariety);
            pair.SoundChangeFrequencyDistribution = new ConditionalFrequencyDistribution<SoundContext, Ngram<Segment>>();

            // Count the first segment of word 2 once under the context of the first segment of word 1.
            var sourceContext = _word1.Shape.First.ToSoundContext(_segmentPool, Enumerable.Empty<SoundClass>());
            var targetSegment = _segmentPool.Get(_word2.Shape.First);
            pair.SoundChangeFrequencyDistribution[sourceContext].Increment(targetSegment);

            pair.SoundChangeProbabilityDistribution = new ConditionalProbabilityDistribution<SoundContext, Ngram<Segment>>(
                pair.SoundChangeFrequencyDistribution,
                (context, freqDist) => new MaxLikelihoodProbabilityDistribution<Ngram<Segment>>(freqDist));

            firstVariety.VarietyPairs.VarietyPairAdded(pair);
            secondVariety.VarietyPairs.VarietyPairAdded(pair);
        }
        /// <summary>
        /// The period inside a decimal amount in mid-sentence is expected not to split the sentence.
        /// </summary>
        public void NumberAsNonSentenceBoundary019()
        {
            const string text = "She has $100.00 in her bag.";
            string[] expected = { "She has $100.00 in her bag." };

            var sentences = Segmenter.Segment(text);

            Assert.Equal(expected, sentences);
        }