/// <summary>
/// Builds the model from the POS-tagger resources XML file.
/// </summary>
/// <remarks>
/// The following child elements of the document root are read, in this order:
/// <list type="bullet">
/// <item><description><c>numbers</c> - trimmed, upper-cased (invariant) values stored in <c>Numbers</c>.</description></item>
/// <item><description><c>phrases-rus</c> - trimmed, upper-cased (invariant) values split on spaces and handed to an <c>AhoCorasick</c> instance stored in <c>PhrasesSearcher</c>.</description></item>
/// <item><description><c>abbreviations</c> - used twice: once (case preserved) by <c>CreateAbbreviationSearcher</c> for <c>AbbreviationsSearcher</c>, and once upper-cased with all spaces removed for <c>Abbreviations</c>.</description></item>
/// </list>
/// Empty values (after trimming) are skipped. A section that is absent from the document is not
/// handled specially: <c>Element(...)</c> returns <c>null</c> in that case and the subsequent
/// member access fails with a <see cref="NullReferenceException"/>.
/// </remarks>
/// <param name="posTaggerResourcesXmlFilename">Path of the resources XML file passed to <c>XDocument.Load</c>.</param>
public PosTaggerResourcesModel(string posTaggerResourcesXmlFilename)
{
    char[] spaceSeparator = { ' ' };

    XDocument document = XDocument.Load(posTaggerResourcesXmlFilename);

    // 1. Numbers: upper-cased, non-empty element values.
    IEnumerable<string> numberValues = document.Root.Element("numbers").Elements()
        .Select(element => element.Value.Trim().ToUpperInvariant())
        .Where(text => !string.IsNullOrEmpty(text));
    Numbers = new HashSet<string>(numberValues);

    // 2. Phrases: each upper-cased, non-empty value becomes an array of its space-separated words.
    List<string[]> phraseWords = document.Root.Element("phrases-rus").Elements()
        .Select(element => element.Value.Trim().ToUpperInvariant())
        .Where(text => !string.IsNullOrEmpty(text))
        .Select(text => text.Split(spaceSeparator, StringSplitOptions.RemoveEmptyEntries))
        .ToList();
    PhrasesSearcher = new AhoCorasick(phraseWords);

    // 3. Abbreviation searcher: built by the dedicated helper from the same <abbreviations> section.
    AbbreviationsSearcher = CreateAbbreviationSearcher("abbreviations", document, posTaggerResourcesXmlFilename);

    // 4. Abbreviation lookup set: upper-cased values with every space stripped out.
    IEnumerable<string> abbreviationValues = document.Root.Element("abbreviations").Elements()
        .Select(element => element.Value.Trim().ToUpperInvariant().Replace(" ", string.Empty))
        .Where(text => !string.IsNullOrEmpty(text));
    Abbreviations = new HashSet<string>(abbreviationValues);
}
/// <summary>
/// Reads the abbreviations listed under the given root child element and builds an
/// <c>AhoCorasick</c> searcher from their expanded forms.
/// </summary>
/// <remarks>
/// Each non-empty (trimmed) element value is split on dots; every resulting piece is trimmed,
/// gets a trailing dot re-appended, and the array of pieces is passed to
/// <c>open_abbreviation_permutations</c>. All returned arrays are collected and given to the
/// <c>AhoCorasick</c> constructor.
/// </remarks>
/// <param name="elementName">Name of the child element of the document root that holds the abbreviations.</param>
/// <param name="xdoc">The loaded resources document.</param>
/// <param name="posTaggerResourcesXmlFilename">Path of the resources file; used only in the error message.</param>
/// <returns>The searcher built from all expanded abbreviations.</returns>
/// <exception cref="InvalidDataException">
/// An abbreviation contains a piece between dots that is empty after trimming.
/// </exception>
private static AhoCorasick CreateAbbreviationSearcher(string elementName, XDocument xdoc, string posTaggerResourcesXmlFilename)
{
    const char Dot = '.';
    char[] dotSeparator = { Dot };

    // Case-sensitive on purpose: the values are deliberately NOT upper-cased here.
    string[] entries = xdoc.Root.Element(elementName).Elements()
        .Select(element => element.Value.Trim())
        .Where(text => !string.IsNullOrEmpty(text))
        .ToArray();

    var expanded = new List<string[]>();
    foreach (string entry in entries)
    {
        string[] parts = entry.Split(dotSeparator, StringSplitOptions.RemoveEmptyEntries);

        // Normalize every piece in place: trim it and put back the dot removed by Split.
        for (int index = 0; index < parts.Length; index++)
        {
            string part = parts[index].Trim();
            if (part.Length == 0)
            {
                string message = "Wrong data in <abbreviation> section => [empty subitem], word: '" + entry +
                                 "', pos-tagger-resources-xml-file: '" + posTaggerResourcesXmlFilename + "'";
                throw new InvalidDataException(message);
            }

            parts[index] = part + Dot;
        }

        // NOTE(review): the result is not checked for null (an earlier check, which reported
        // "more then 4 subitems in abbreviation", was disabled). If the helper can return null,
        // AddRange below throws ArgumentNullException - confirm the helper's contract.
        var permutations = open_abbreviation_permutations(parts, true);
        expanded.AddRange(permutations);
    }

    return new AhoCorasick(expanded);
}