/// <summary>
/// Builds the sentence-splitter model from an XML resources file.
/// Reads the "smiles", "interjections", "yandex-combinations", "file-extensions",
/// "before-no-proper" and "before-proper-or-number" sections under the document root,
/// then builds the per-character lookup table and pins it so it can be read through
/// a raw <c>SentCharType*</c> pointer.
/// </summary>
/// <param name="sentSplitterResourcesXmlFilename">Path of the XML resources file passed to <c>XDocument.Load</c>.</param>
public SentSplitterModel(string sentSplitterResourcesXmlFilename)
{
    var root = XDocument.Load(sentSplitterResourcesXmlFilename).Root;

    // smiles: trimmed element text -> smile descriptor carrying the "space-before" flag
    var smilePairs = root.Element("smiles")
                         .Elements()
                         .Select(xe => new KeyValuePair<string, smile_t>(
                             xe.Value.Trim(),
                             new smile_t(xe.AttrValueIsTrue("space-before"))));
    Smiles = new dictionary_t<smile_t>(smilePairs.ToDictionary(false));

    // interjections: trimmed text passed through TrimEndDot()
    var interjectionItems = root.Element("interjections")
                                .Elements()
                                .Select(xe => xe.Value.Trim().TrimEndDot());
    Interjections = new hashset_t(interjectionItems.ToHashset(true));

    // yandex-combinations: trimmed text passed through TrimStartDot()
    var yandexItems = root.Element("yandex-combinations")
                          .Elements()
                          .Select(xe => xe.Value.Trim().TrimStartDot());
    YandexCombinations = new hashset_t(yandexItems.ToHashset(true));

    // file-extensions: trimmed text passed through TrimStartDot()
    var extensionItems = root.Element("file-extensions")
                             .Elements()
                             .Select(xe => xe.Value.Trim().TrimStartDot());
    FileExtensions = new hashset_t(extensionItems.ToHashset(true));

    // before-no-proper: n-grams feeding an Aho-Corasick searcher
    var beforeNoProperNgrams = root.Element("before-no-proper")
                                   .Elements()
                                   .Select(xe => xe.ToBeforeNoProper_ngrams())
                                   .ToArray();
    BeforeNoProperSearcher = new AhoCorasick<before_no_proper_t>(beforeNoProperNgrams);

    // before-proper-or-number: n-grams feeding an Aho-Corasick searcher
    var beforeProperOrNumberNgrams = root.Element("before-proper-or-number")
                                         .Elements()
                                         .Select(xe => xe.ToBeforeProperOrNumber_ngrams())
                                         .ToArray();
    BeforeProperOrNumberSearcher = new AhoCorasick<before_proper_or_number_t>(beforeProperOrNumberNgrams);

    // Build the per-character table, then pin it: the pinned handle keeps the array at a
    // fixed address so the raw pointer taken below stays valid.
    // NOTE(review): the handle must be freed somewhere outside this block - confirm.
    var charTypeTable = InitializeSentPotentialEnds(Smiles, beforeNoProperNgrams, beforeProperOrNumberNgrams);
    _SENTCHARTYPE_MAP_GCHandle = GCHandle.Alloc(charTypeTable, GCHandleType.Pinned);
    _SENTCHARTYPE_MAP = (SentCharType *)_SENTCHARTYPE_MAP_GCHandle.AddrOfPinnedObject().ToPointer();
}
/// <summary>
/// Builds the per-character table of <c>SentCharType</c> flags (stored as bytes) and
/// fills <c>UnstickFromDigits</c> with the first word of every n-gram flagged
/// <c>UnstickFromDigits</c>.
/// </summary>
/// <param name="smiles">Smiles dictionary; the first character of every key is marked <c>SmileBegin</c>.</param>
/// <param name="beforeNoProper">"before-no-proper" n-grams; only their <c>UnstickFromDigits</c> flag is read here.</param>
/// <param name="beforeProperOrNumber">"before-proper-or-number" n-grams; validated and scanned for <c>UnstickFromDigits</c>.</param>
/// <returns>A byte array with one entry for every possible <c>char</c> value (0..<c>char.MaxValue</c> inclusive).</returns>
/// <exception cref="ArgumentException">
/// An n-gram with <c>DigitsBefore</c> set is not a single two-character word ending with a dot.
/// </exception>
private byte[] InitializeSentPotentialEnds(
    dictionary_t<smile_t> smiles,
    ngram_t<before_no_proper_t>[] beforeNoProper,
    ngram_t<before_proper_or_number_t>[] beforeProperOrNumber)
{
    // One slot per possible char value, i.e. char.MaxValue + 1 entries. The table is pinned
    // by the constructor and read through a raw pointer without bounds checks, so a table of
    // only char.MaxValue entries would let a lookup for '\uFFFF' read past the end of the buffer.
    var map = new byte[char.MaxValue + 1];

    //-smile's- first character of every smile key
    foreach (var firstChar in smiles.Values.Keys.Select(k => k[0]))
    {
        map[firstChar] |= (byte)SentCharType.SmileBegin;
    }

    map['!'] |= (byte)SentCharType.ExcludeInBracketAndQuote;
    map['?'] |= (byte)SentCharType.ExcludeInBracketAndQuote;
    map['…'] |= (byte)SentCharType.ExcludeInBracketAndQuote | (byte)SentCharType.ThreeDot;

    // Plain assignment (not |=): any flag set above for these two characters is discarded.
    // NOTE(review): presumably intentional so '\n' and '.' carry exactly one flag - confirm.
    //-un-conditional-
    map['\n'] = (byte)SentCharType.Unconditional;
    //-dot-
    map['.'] = (byte)SentCharType.Dot;

    //-after ThreeDot allowed punctuation-
    map[';'] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation;
    map[':'] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation | (byte)SentCharType.AfterBracketAllowedPunctuation4QMEP;
    map[','] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation | (byte)SentCharType.AfterBracketAllowedPunctuation4QMEP;

    // Hyphens get both "allowed punctuation" flags, quotes only the after-three-dot one.
    // The bound stays exclusive of char.MaxValue because the length of xlat.CHARTYPE_MAP is
    // not visible here; the entry for '\uFFFF' therefore simply keeps whatever it has (zero).
    for (var c = char.MinValue; c < char.MaxValue; c++)
    {
        var ct = xlat.CHARTYPE_MAP[c];
        if ((ct & CharType.IsHyphen) == CharType.IsHyphen)
        {
            map[c] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation | (byte)SentCharType.AfterBracketAllowedPunctuation4QMEP;
        }
        else if ((ct & CharType.IsQuote) == CharType.IsQuote)
        {
            map[c] |= (byte)SentCharType.AfterThreeDotAllowedPunctuation;
        }
    }

    //roman-digit
    // NOTE(review): 'D' is not in this set - confirm whether that omission is deliberate.
    foreach (var romanDigit in "IVXCLM")
    {
        map[romanDigit] |= (byte)SentCharType.RomanDigit;
    }

    // Validate the shape of items marked with digits-before: exactly one word, two chars, trailing dot.
    foreach (var ngram in beforeProperOrNumber)
    {
        if (!ngram.value.DigitsBefore)
        {
            continue;
        }

        if (ngram.words.Length != 1 ||
            ngram.words[0].Length != 2 ||
            ngram.words[0][1] != '.')
        {
            throw (new ArgumentException("Value for <before-proper-or-number> items with attribute [ @digits-before='true' ] must be single word length of 2 with dot on end, wrong value: " + ngram));
        }
    }

    // Collect the first word of every n-gram flagged UnstickFromDigits, from both lists.
    UnstickFromDigits = new HashSet<string>();
    foreach (var ngram in beforeNoProper)
    {
        if (ngram.value.UnstickFromDigits)
        {
            UnstickFromDigits.Add(ngram.words[0]);
        }
    }
    foreach (var ngram in beforeProperOrNumber)
    {
        if (ngram.value.UnstickFromDigits)
        {
            UnstickFromDigits.Add(ngram.words[0]);
        }
    }

    return map;
}