/// <summary>
/// Builds a tagger over the supplied lexicon. Entries are indexed by their
/// <c>Value</c> (lower-cased unless <c>MatchesAreCaseSensitive</c> is set);
/// when two entries share the same key, their tag lists are merged without
/// duplicating tags.
/// </summary>
/// <param name="lexicon">The lexicon entries to index; must not be null.</param>
/// <param name="options">Processing options (casing, delimiters, etc).</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="lexicon"/> is null.</exception>
public Tagger(IList<LexiconEntry> lexicon, StringMixOptions options)
{
    if (lexicon == null)
    {
        throw new ArgumentNullException(nameof(lexicon));
    }

    _lexicon = new Dictionary<string, LexiconEntry>(DEFAULT_CAPACITY);
    _options = options;

    foreach (var item in lexicon)
    {
        // Normalize the lookup key when matching is case-insensitive.
        string key = _options.MatchesAreCaseSensitive ? item.Value : item.Value.ToLower();

        // TryGetValue avoids the repeated ContainsKey + indexer lookups of the
        // original implementation.
        if (_lexicon.TryGetValue(key, out var existing))
        {
            // Merge only the tags not already present. (Bug fix: the previous
            // code called Tags.AddRange(item.Tags) once per missing tag, which
            // appended the whole tag list repeatedly and created duplicates.)
            foreach (var tag in item.Tags)
            {
                if (!existing.Tags.Contains(tag))
                {
                    existing.Tags.Add(tag);
                }
            }
        }
        else
        {
            _lexicon.Add(key, item);
        }
    }
}
/// <summary>
/// Convenience factory: constructs a <see cref="Tagger"/> for the given
/// lexicon and options.
/// </summary>
/// <param name="lexicon">See LexiconEntry.</param>
/// <param name="options">See StringMixOptions.</param>
/// <returns>A new <see cref="Tagger"/> instance.</returns>
public Tagger GetBasicTagger(List<LexiconEntry> lexicon, StringMixOptions options) => new Tagger(lexicon, options);
/// <summary>
/// For a given string, lexicon and processing options, produce a list of
/// tagged tokens.
/// </summary>
/// <param name="str">The string to operate upon.</param>
/// <param name="lexicon">See LexiconEntry.</param>
/// <param name="options">
/// See StringMixOptions: string processing options for delimiters, casing, etc.
/// </param>
/// <returns>
/// A list of tagged tokens. Tokens are the terms seen in the processed text:
/// for "Fred Flintstone" there are two terms — tokens. Using the lexicon, the
/// library attaches tags — meaning — to these terms. These meanings can be
/// turned into sequences — patterns — that can then be further processed for
/// matches.
/// </returns>
public static List<TaggedToken> Tokenize(this String str, List<LexiconEntry> lexicon, StringMixOptions options)
{
    // Build a tagger from the inputs and delegate to the Tagger-based overload.
    var tagger = new Tagger(lexicon, options);
    return Tokenize(str, tagger);
}
/// <summary>
/// A convenience method that performs the full chain of tokenize, tag, match,
/// and transform on a string. It is the equivalent of doing:
///
/// "Fred Flintstone".Tokenize(lexicon, options).Match("FL").Transform&lt;Name&gt;(NameTransformer);
///
/// </summary>
/// <typeparam name="T">The type that should be returned by this call.</typeparam>
/// <param name="str">The string being operated upon.</param>
/// <param name="lexicon">See LexiconEntry.</param>
/// <param name="options">
/// The StringMixOptions that you could have provided to the Tokenize method.
/// </param>
/// <param name="MatchRegEx">
/// The regular expression tested against all of the patterns of the tokens in
/// the target string. Equivalent to a call to .Match(tokens, [match criteria]).
/// </param>
/// <param name="transformer">
/// The ITransformer[T] implementation used to convert the match set to the
/// target object type.
/// </param>
/// <returns>An object of type T.</returns>
public static T Transform<T>(this String str, List<LexiconEntry> lexicon, StringMixOptions options, string MatchRegEx, ITransformer<T> transformer) where T : new()
{
    // Spell out each stage of the pipeline for readability.
    var tokens = Tokenize(str, lexicon, options);
    var matches = tokens.Match(MatchRegEx);
    return matches.Transform<T>(transformer);
}