Exemple #1
0
        /// <summary>
        /// Builds the model from an XML resources file: loads every lookup section
        /// found under the document root and prepares the pinned per-character
        /// classification table.
        /// </summary>
        /// <param name="sentSplitterResourcesXmlFilename">
        /// Location of the resources XML; handed unchanged to <c>XDocument.Load</c>.
        /// </param>
        /// <remarks>
        /// Sections read, in order: <c>smiles</c>, <c>interjections</c>,
        /// <c>yandex-combinations</c>, <c>file-extensions</c>, <c>before-no-proper</c>,
        /// <c>before-proper-or-number</c>. A missing section is not handled here:
        /// <c>Element(...)</c> returns null and the following call fails.
        /// </remarks>
        public SentSplitterModel(string sentSplitterResourcesXmlFilename)
        {
            var root = XDocument.Load(sentSplitterResourcesXmlFilename).Root;

            // <smiles>: trimmed element text is the key, the "space-before" attribute goes into the value.
            var smilePairs = root.Element("smiles").Elements()
                                 .Select(xe => new KeyValuePair<string, smile_t>(
                                             xe.Value.Trim(),
                                             new smile_t(xe.AttrValueIsTrue("space-before"))));
            Smiles = new dictionary_t<smile_t>(smilePairs.ToDictionary(false));

            // <interjections>: trimmed text passed through TrimEndDot().
            var interjectionItems = root.Element("interjections").Elements()
                                        .Select(xe => xe.Value.Trim().TrimEndDot());
            Interjections = new hashset_t(interjectionItems.ToHashset(true));

            // <yandex-combinations>: trimmed text passed through TrimStartDot().
            var yandexItems = root.Element("yandex-combinations").Elements()
                                  .Select(xe => xe.Value.Trim().TrimStartDot());
            YandexCombinations = new hashset_t(yandexItems.ToHashset(true));

            // <file-extensions>: trimmed text passed through TrimStartDot().
            var extensionItems = root.Element("file-extensions").Elements()
                                     .Select(xe => xe.Value.Trim().TrimStartDot());
            FileExtensions = new hashset_t(extensionItems.ToHashset(true));

            // <before-no-proper>: materialized once; the array feeds both the searcher and the char table.
            var beforeNoProper = root.Element("before-no-proper").Elements()
                                     .Select(xe => xe.ToBeforeNoProper_ngrams())
                                     .ToArray();
            BeforeNoProperSearcher = new AhoCorasick<before_no_proper_t>(beforeNoProper);

            // <before-proper-or-number>: same treatment as the section above.
            var beforeProperOrNumber = root.Element("before-proper-or-number").Elements()
                                           .Select(xe => xe.ToBeforeProperOrNumber_ngrams())
                                           .ToArray();
            BeforeProperOrNumberSearcher = new AhoCorasick<before_proper_or_number_t>(beforeProperOrNumber);

            // Build the managed table, then pin it so the raw pointer taken below stays valid.
            // NOTE(review): the matching GCHandle.Free() is not visible in this block - confirm it is released on disposal.
            var charTypeTable = InitializeSentPotentialEnds(Smiles, beforeNoProper, beforeProperOrNumber);

            _SENTCHARTYPE_MAP_GCHandle = GCHandle.Alloc(charTypeTable, GCHandleType.Pinned);
            _SENTCHARTYPE_MAP          = (SentCharType *)_SENTCHARTYPE_MAP_GCHandle.AddrOfPinnedObject().ToPointer();
        }
Exemple #2
0
        /// <summary>
        /// Builds the per-character flag table (one byte of <c>SentCharType</c> bits per
        /// UTF-16 code unit) and, as a side effect, rebuilds <c>UnstickFromDigits</c>.
        /// </summary>
        /// <param name="smiles">Smiles dictionary; the first character of every key is flagged as <c>SmileBegin</c>.</param>
        /// <param name="beforeNoProper">N-grams whose first word is added to <c>UnstickFromDigits</c> when the item requests it.</param>
        /// <param name="beforeProperOrNumber">
        /// N-grams that are validated (items with <c>DigitsBefore</c> must be a single
        /// two-character word ending with a dot) and also contribute to <c>UnstickFromDigits</c>.
        /// </param>
        /// <returns>A new table indexable by any <c>char</c> value, U+0000 through U+FFFF inclusive.</returns>
        /// <exception cref="ArgumentException">
        /// A <c>beforeProperOrNumber</c> item has <c>DigitsBefore</c> set but is not a single
        /// two-character word ending with '.'.
        /// </exception>
        private byte[] InitializeSentPotentialEnds(
            dictionary_t <smile_t> smiles,
            ngram_t <before_no_proper_t>[]        beforeNoProper,
            ngram_t <before_proper_or_number_t>[] beforeProperOrNumber)
        {
            // char.MaxValue is 0xFFFF, so a table that can be indexed by every char needs
            // char.MaxValue + 1 slots. With only char.MaxValue slots, index 0xFFFF is out of range.
            var SENTCHARTYPE_MAP = new byte[char.MaxValue + 1];

            //-smiles: flag the first character of every smile-
            foreach (var firstChar in smiles.Values.Keys.Select(k => k[0]))
            {
                SENTCHARTYPE_MAP[firstChar] |= (byte)SentCharType.SmileBegin;
            }

            SENTCHARTYPE_MAP['!'] |= (byte)SentCharType.ExcludeInBracketAndQuote;
            SENTCHARTYPE_MAP['?'] |= (byte)SentCharType.ExcludeInBracketAndQuote;
            SENTCHARTYPE_MAP['…'] |= (byte)SentCharType.ExcludeInBracketAndQuote | (byte)SentCharType.ThreeDot;

            //-un-conditional-
            // Plain assignment: replaces any flag set above for this character (e.g. SmileBegin).
            SENTCHARTYPE_MAP['\n'] = (byte)SentCharType.Unconditional;

            //-dot-
            // Plain assignment as well: '.' ends up carrying only the Dot flag at this point.
            SENTCHARTYPE_MAP['.'] = (byte)SentCharType.Dot;

            //-after ThreeDot allowed punctuation-
            SENTCHARTYPE_MAP[';'] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation;
            SENTCHARTYPE_MAP[':'] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation | (byte)SentCharType.AfterBracketAllowedPunctuation4QMEP;
            SENTCHARTYPE_MAP[','] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation | (byte)SentCharType.AfterBracketAllowedPunctuation4QMEP;

            // Hyphens and quotes come from the shared character-type table.
            // The bound stays exclusive on purpose: a char counter cannot step past
            // char.MaxValue (it would wrap around), and the length of xlat.CHARTYPE_MAP
            // is not visible here, so U+FFFF is not looked up.
            for (var c = char.MinValue; c < char.MaxValue; c++)
            {
                var ct = xlat.CHARTYPE_MAP[c];
                if ((ct & CharType.IsHyphen) == CharType.IsHyphen)
                {
                    SENTCHARTYPE_MAP[c] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation | (byte)SentCharType.AfterBracketAllowedPunctuation4QMEP;
                }
                else if ((ct & CharType.IsQuote) == CharType.IsQuote)
                {
                    SENTCHARTYPE_MAP[c] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation;
                }
            }

            //-roman-digit-
            foreach (var romanDigit in "IVXCLM")
            {
                SENTCHARTYPE_MAP[romanDigit] |= (byte)SentCharType.RomanDigit;
            }

            // Validate items marked with DigitsBefore: exactly one word, two characters, trailing dot.
            foreach (var ngram in beforeProperOrNumber)
            {
                if (!ngram.value.DigitsBefore)
                {
                    continue;
                }

                if (ngram.words.Length != 1 ||
                    ngram.words[0].Length != 2 ||
                    ngram.words[0][1] != '.')
                {
                    throw (new ArgumentException("Value for <before-proper-or-number> items with attribute [ @digits-before='true' ] must be single word length of 2 with dot on end, wrong value: " + ngram));
                }
            }

            // Collect the first word of every n-gram flagged UnstickFromDigits, from both lists.
            // This happens only after validation succeeded, so a failed validation leaves
            // the previous UnstickFromDigits value untouched.
            UnstickFromDigits = new HashSet <string>();
            foreach (var ngram in beforeNoProper)
            {
                if (ngram.value.UnstickFromDigits)
                {
                    UnstickFromDigits.Add(ngram.words[0]);
                }
            }
            foreach (var ngram in beforeProperOrNumber)
            {
                if (ngram.value.UnstickFromDigits)
                {
                    UnstickFromDigits.Add(ngram.words[0]);
                }
            }

            return(SENTCHARTYPE_MAP);
        }