/// <summary>
/// Factory for the NERClassifierCombiner that NERCombinerAnnotator
/// (and, through it, StanfordCoreNLP) relies on.
/// </summary>
/// <param name="name">
/// The prefix part ("x") of "x.y" style property names. When null, "ner" is used;
/// when it is the empty string, properties are looked up without any prefix.
/// </param>
/// <param name="passDownProperties">
/// Names of the properties that should be handed down to the NERClassifierCombiner
/// (handy for settings such as charset encoding). When null, the complete
/// <paramref name="properties"/> object is passed through unchanged.
/// </param>
/// <param name="properties">
/// The configuration to read, including the comma-separated model list stored under
/// prefix + "model". Only entries starting with the prefix, or listed in
/// <paramref name="passDownProperties"/>, are consulted.
/// </param>
/// <returns>An NERClassifierCombiner configured from the given properties.</returns>
public static NERClassifierCombiner CreateNERClassifierCombiner(string name, ICollection<string> passDownProperties, Properties properties)
{
    // Resolve the property prefix: null -> "ner.", "" -> no prefix, anything else -> name + '.'.
    string prefix;
    if (name == null)
    {
        prefix = "ner.";
    }
    else if (name.IsEmpty())
    {
        prefix = string.Empty;
    }
    else
    {
        prefix = name + '.';
    }

    // Fall back to the three default models only when the property is absent altogether.
    string modelNames = properties.GetProperty(prefix + "model")
        ?? (DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel);

    // The property may still have been set explicitly to the empty string, which means
    // "no statistical model": only the numeric classifiers and/or SUTime remain available.
    string[] models;
    if (modelNames.IsEmpty())
    {
        log.Info("WARNING: no NER models specified");
        models = StringUtils.EmptyStringArray;
    }
    else
    {
        models = modelNames.Split(",");
    }

    try
    {
        bool numericClassifiersEnabled = PropertiesUtils.GetBool(properties, prefix + ApplyNumericClassifiersPropertyBase, ApplyNumericClassifiersDefault);
        bool sutimeEnabled = PropertiesUtils.GetBool(properties, prefix + NumberSequenceClassifier.UseSutimePropertyBase, NumberSequenceClassifier.UseSutimeDefault);
        // Note: the gazette/regexner switch is read from its fixed property name, not from the prefix.
        bool regexnerEnabled = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);

        // Without an explicit pass-down list everything is forwarded as-is.
        Properties combinerProperties = properties;
        if (passDownProperties != null)
        {
            combinerProperties = PropertiesUtils.ExtractSelectedProperties(properties, passDownProperties);
            if (sutimeEnabled)
            {
                // SUTime settings are not part of the pass-down list, so copy them over explicitly.
                Properties sutimeProperties = PropertiesUtils.ExtractPrefixedProperties(properties, NumberSequenceClassifier.SutimeProperty + ".", true);
                PropertiesUtils.OverWriteProperties(combinerProperties, sutimeProperties);
            }
        }

        NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.FromString(
            properties.GetProperty(prefix + "language"),
            NERClassifierCombiner.Language.English);

        return new NERClassifierCombiner(numericClassifiersEnabled, nerLanguage, sutimeEnabled, regexnerEnabled, combinerProperties, models);
    }
    catch (IOException e)
    {
        // Surface I/O failures as an unchecked exception so callers need not declare them.
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Builds the annotator from configuration: reads the "ner.*" settings, creates the
/// underlying NERClassifierCombiner and then sets up the fine-grained, additional-rules
/// and entity-mention stages.
/// </summary>
/// <param name="properties">
/// Configuration to read. "ner.model" holds a comma-separated model list; when it is
/// absent the three default models are used, and when it is empty no model is loaded.
/// </param>
/// <exception cref="System.IO.IOException">
/// Declared by the original signature; presumably raised while the combiner loads its models.
/// </exception>
public NERCombinerAnnotator(Properties properties)
{
    // Use the default model list only when "ner.model" is not set at all.
    string modelNames = properties.GetProperty("ner.model")
        ?? (DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel);

    string[] loadPaths = modelNames.IsEmpty() ? new string[0] : modelNames.Split(",");
    if (loadPaths.Length == 0)
    {
        // Running without a statistical model is allowed (numeric classifiers or SUTime only).
        // "ner.model" has to be unset on purpose for this, so it is unlikely to be an accident.
        log.Info("WARNING: no NER models specified");
    }

    bool numericClassifiersEnabled = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyNumericClassifiersProperty, NERClassifierCombiner.ApplyNumericClassifiersDefault);
    bool regexnerEnabled = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);
    bool sutimeEnabled = PropertiesUtils.GetBool(properties, NumberSequenceClassifier.UseSutimeProperty, NumberSequenceClassifier.UseSutimeDefault);

    // Option: treat "now" as the document date on every annotation.
    usePresentDateForDocDate = PropertiesUtils.GetBool(properties, "ner.usePresentDateForDocDate", false);

    // Option: take the document date from a supplied string; it is kept only when the
    // whole value matches the four-digit, two-digit, two-digit pattern below.
    string requestedDocDate = PropertiesUtils.GetString(properties, "ner.providedDocDate", string.Empty);
    Matcher docDateMatcher = Pattern.Compile("[0-9]{4}\\-[0-9]{2}\\-[0-9]{2}").Matcher(requestedDocDate);
    providedDocDate = docDateMatcher.Matches() ? requestedDocDate : string.Empty;

    string configuredNerLanguage = PropertiesUtils.GetString(properties, NERClassifierCombiner.NerLanguageProperty, null);
    NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.FromString(configuredNerLanguage, NERClassifierCombiner.NerLanguageDefault);

    bool verboseRequested = PropertiesUtils.GetBool(properties, "ner.verbose", false);

    // Forward only the default pass-down properties, plus the SUTime ones when SUTime is on.
    Properties combinerProperties = PropertiesUtils.ExtractSelectedProperties(properties, NERClassifierCombiner.DefaultPassDownProperties);
    if (sutimeEnabled)
    {
        Properties sutimeProperties = PropertiesUtils.ExtractPrefixedProperties(properties, NumberSequenceClassifier.SutimeProperty + '.', true);
        PropertiesUtils.OverWriteProperties(combinerProperties, sutimeProperties);
    }

    NERClassifierCombiner combiner = new NERClassifierCombiner(numericClassifiersEnabled, nerLanguage, sutimeEnabled, regexnerEnabled, combinerProperties, loadPaths);

    // "ner.nthreads" wins over the global "nthreads"; a single thread is the last resort.
    int globalThreadCount = PropertiesUtils.GetInt(properties, "nthreads", 1);
    this.nThreads = PropertiesUtils.GetInt(properties, "ner.nthreads", globalThreadCount);
    this.maxTime = PropertiesUtils.GetLong(properties, "ner.maxtime", 0);
    this.maxSentenceLength = PropertiesUtils.GetInt(properties, "ner.maxlen", int.MaxValue);

    string languageName = PropertiesUtils.GetString(properties, "ner.language", "en");
    this.language = LanguageInfo.GetLanguageFromString(languageName);

    // Spanish additionally gets a dedicated number regexner annotator.
    if (language.Equals(LanguageInfo.HumanLanguage.Spanish))
    {
        Properties spanishNumberProps = new Properties();
        spanishNumberProps["spanish.number.regexner.mapping"] = spanishNumberRegexRules;
        spanishNumberProps["spanish.number.regexner.validpospattern"] = "^(NUM).*";
        spanishNumberProps["spanish.number.regexner.ignorecase"] = "true";
        spanishNumberAnnotator = new TokensRegexNERAnnotator("spanish.number.regexner", spanishNumberProps);
    }

    // Remaining pipeline pieces, in this order: fine-grained NER, additional rules, entity mentions.
    SetUpFineGrainedNER(properties);
    SetUpAdditionalRulesNER(properties);
    SetUpEntityMentionBuilding(properties);

    Verbose = verboseRequested;
    this.ner = combiner;
}