Пример #1
0
        /// <summary>
        /// Builds an <see cref="AhoCorasick"/> searcher from the dot-separated abbreviations listed
        /// under the <paramref name="elementName"/> child of the document root.
        /// </summary>
        /// <remarks>
        /// Every child element with a non-empty (trimmed) value is split on '.', each part is trimmed
        /// and re-suffixed with a dot, and the parts are handed to <c>OpenAbbreviationPermutations</c>.
        /// Everything that call returns is collected and passed to the searcher.
        /// </remarks>
        /// <param name="elementName">Name of the section element located directly under the root.</param>
        /// <param name="xdoc">The loaded resources document.</param>
        /// <param name="posTaggerResourcesXmlFilename">Name of the resources file; used only in error messages.</param>
        /// <returns>A searcher constructed over all collected abbreviation sequences.</returns>
        /// <exception cref="InvalidDataException">
        /// The section is missing, or an abbreviation contains a part that is whitespace only.
        /// </exception>
        private static AhoCorasick CreateAbbreviationSearcher(string elementName, XDocument xdoc, string posTaggerResourcesXmlFilename)
        {
            const char DOT          = '.';
            var        SPLIT_BY_DOT = new[] { DOT };

            // Element() returns null for an absent section; report that explicitly
            // instead of failing later with a context-free NullReferenceException.
            var section = xdoc.Root?.Element(elementName);
            if (section == null)
            {
                throw new InvalidDataException($"Missing <{elementName}> section, pos-tagger-resources-xml-file: '{posTaggerResourcesXmlFilename}'");
            }

            var abbreviations = new List<string[]>();

            foreach (var xe in section.Elements())
            {
                var word = xe.Value.Trim();
                if (string.IsNullOrEmpty(word))
                {
                    continue;
                }

                var parts = word.Split(SPLIT_BY_DOT, StringSplitOptions.RemoveEmptyEntries);
                for (var i = 0; i < parts.Length; i++)
                {
                    // RemoveEmptyEntries drops "" but keeps whitespace-only parts (e.g. "a. .b"),
                    // so the emptiness check has to happen after trimming.
                    var part = parts[i].Trim();
                    if (string.IsNullOrEmpty(part))
                    {
                        throw new InvalidDataException($"Wrong data in <{elementName}> section => [empty subitem], word: '{word}', pos-tagger-resources-xml-file: '{posTaggerResourcesXmlFilename}'");
                    }

                    // Restore the dot that Split() consumed.
                    parts[i] = part + DOT;
                }

                // NOTE(review): presumably expands the parts into the spelling variants to match -
                // confirm against OpenAbbreviationPermutations.
                abbreviations.AddRange(OpenAbbreviationPermutations(parts, true));
            }

            return new AhoCorasick(abbreviations);
        }
Пример #2
0
        /// <summary>
        /// Loads the pos-tagger resources XML file and fills <c>Numbers</c>, <c>PhrasesSearcher</c>,
        /// <c>AbbreviationsSearcher</c> and <c>Abbreviations</c> from its "numbers", "phrases-rus"
        /// and "abbreviations" sections.
        /// </summary>
        /// <param name="posTaggerResourcesXmlFilename">Resources XML file, passed to <c>XDocument.Load</c>.</param>
        /// <exception cref="System.IO.InvalidDataException">
        /// One of the required sections is missing under the document root.
        /// </exception>
        public PosTaggerResourcesModel(string posTaggerResourcesXmlFilename)
        {
            const string NUMBERS_SECTION       = "numbers";
            const string PHRASES_SECTION       = "phrases-rus";
            const string ABBREVIATIONS_SECTION = "abbreviations";

            var SPLIT_BY_SPACE = new[] { ' ' };
            var xdoc           = XDocument.Load(posTaggerResourcesXmlFilename);
            var root           = xdoc.Root;

            // Element() returns null for an absent section. Check all required sections up front so a
            // broken resources file is reported by name rather than as a NullReferenceException.
            foreach (var sectionName in new[] { NUMBERS_SECTION, PHRASES_SECTION, ABBREVIATIONS_SECTION })
            {
                if (root.Element(sectionName) == null)
                {
                    throw new System.IO.InvalidDataException($"Missing <{sectionName}> section, pos-tagger-resources-xml-file: '{posTaggerResourcesXmlFilename}'");
                }
            }

            // Numbers: trimmed, upper-cased, non-empty element values.
            var numbers = root.Element(NUMBERS_SECTION).Elements()
                              .Select(xe => xe.Value.Trim().ToUpperInvariant())
                              .Where(v => !string.IsNullOrEmpty(v));

            Numbers = new HashSet<string>(numbers);

            // Phrases: each upper-cased value is split on spaces into its word sequence.
            var phrases = root.Element(PHRASES_SECTION).Elements()
                              .Select(xe => xe.Value.Trim().ToUpperInvariant())
                              .Where(v => !string.IsNullOrEmpty(v))
                              .Select(v => v.Split(SPLIT_BY_SPACE, StringSplitOptions.RemoveEmptyEntries))
                              .ToList();

            PhrasesSearcher = new AhoCorasick(phrases);

            AbbreviationsSearcher = CreateAbbreviationSearcher(ABBREVIATIONS_SECTION, xdoc, posTaggerResourcesXmlFilename);

            // Abbreviations are also kept as a flat set: upper-cased, with inner spaces removed.
            var abbreviations = root.Element(ABBREVIATIONS_SECTION).Elements()
                                    .Select(xe => xe.Value.Trim().ToUpperInvariant().Replace(" ", string.Empty))
                                    .Where(v => !string.IsNullOrEmpty(v));

            Abbreviations = new HashSet<string>(abbreviations);
        }