/// <summary>
/// Builds an <c>AhoCorasick</c> searcher from the abbreviations listed under the
/// <paramref name="elementName"/> child of the document root.
/// </summary>
/// <remarks>
/// Every non-blank child value is split on '.', each part is trimmed and gets the dot
/// appended again, and the parts are handed to <c>OpenAbbreviationPermutations(parts, true)</c>.
/// All arrays returned by that helper are collected and passed to the searcher.
/// NOTE(review): the helper is defined elsewhere; presumably it expands spelling variants of
/// the abbreviation - confirm against its implementation.
/// </remarks>
/// <param name="elementName">Name of the root's child element whose children hold the abbreviations.</param>
/// <param name="xdoc">The loaded resources document.</param>
/// <param name="posTaggerResourcesXmlFilename">Path of the resources file; used only in error messages.</param>
/// <returns>A searcher built over all collected abbreviation part arrays.</returns>
/// <exception cref="InvalidDataException">
/// The requested section is missing, or an abbreviation contains a whitespace-only part between dots.
/// </exception>
private static AhoCorasick CreateAbbreviationSearcher(string elementName, XDocument xdoc, string posTaggerResourcesXmlFilename)
{
    const char DOT = '.';
    var SPLIT_BY_DOT = new[] { DOT };

    // Report a missing section as a data error that names the file, instead of letting
    // the null returned by Element() surface as a NullReferenceException.
    var section = xdoc.Root?.Element(elementName);
    if (section == null)
    {
        throw new InvalidDataException($"Wrong data => [missing <{elementName}> section], pos-tagger-resources-xml-file: '{posTaggerResourcesXmlFilename}'");
    }

    var abbreviations = new List<string[]>();
    foreach (var xe in section.Elements())
    {
        var word = xe.Value.Trim();
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var words_by_dot = word.Split(SPLIT_BY_DOT, StringSplitOptions.RemoveEmptyEntries);
        for (var i = 0; i < words_by_dot.Length; i++)
        {
            // RemoveEmptyEntries drops "" but keeps whitespace-only parts (e.g. "a. .b"),
            // so a part can still become empty after trimming.
            var w = words_by_dot[i].Trim();
            if (string.IsNullOrEmpty(w))
            {
                throw new InvalidDataException($"Wrong data in <abbreviation> section => [empty subitem], word: '{word}', pos-tagger-resources-xml-file: '{posTaggerResourcesXmlFilename}'");
            }

            // Restore the dot that Split consumed.
            words_by_dot[i] = w + DOT;
        }

        abbreviations.AddRange(OpenAbbreviationPermutations(words_by_dot, true));
    }

    return new AhoCorasick(abbreviations);
}
/// <summary>
/// Loads the resources XML file and fills <c>Numbers</c>, <c>PhrasesSearcher</c>,
/// <c>AbbreviationsSearcher</c> and <c>Abbreviations</c> from its
/// "numbers", "phrases-rus" and "abbreviations" sections.
/// </summary>
/// <param name="posTaggerResourcesXmlFilename">Path of the XML file passed to <c>XDocument.Load</c>.</param>
public PosTaggerResourcesModel(string posTaggerResourcesXmlFilename)
{
    var document = XDocument.Load(posTaggerResourcesXmlFilename);
    var root = document.Root;
    var spaceSeparator = new[] { ' ' };

    // Numbers: trimmed, upper-cased, blank entries skipped.
    var numberValues = root.Element("numbers").Elements()
        .Select(element => element.Value.Trim().ToUpperInvariant())
        .Where(text => !string.IsNullOrEmpty(text));
    Numbers = new HashSet<string>(numberValues);

    // Phrases: same normalization, then each phrase is split into its space-separated words.
    var phraseWords = root.Element("phrases-rus").Elements()
        .Select(element => element.Value.Trim().ToUpperInvariant())
        .Where(text => !string.IsNullOrEmpty(text))
        .Select(text => text.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries));
    PhrasesSearcher = new AhoCorasick(phraseWords.ToList());

    AbbreviationsSearcher = CreateAbbreviationSearcher("abbreviations", document, posTaggerResourcesXmlFilename);

    // Abbreviation lookup set: upper-cased with every space removed.
    var abbreviationValues = root.Element("abbreviations").Elements()
        .Select(element => element.Value.Trim().ToUpperInvariant().Replace(" ", string.Empty))
        .Where(text => !string.IsNullOrEmpty(text));
    Abbreviations = new HashSet<string>(abbreviationValues);
}