/// <summary>
/// Creates an entity mentions annotator configured from <paramref name="props"/>.
/// </summary>
/// <remarks>
/// Note: used in annotate.properties.
/// Keys read, each prefixed with <paramref name="name"/> and a dot:
/// <c>nerCoreAnnotation</c>, <c>nerNormalizedCoreAnnotation</c> and <c>mentionsCoreAnnotation</c>
/// (type names that replace the default annotation keys), <c>acronyms</c> (falls back to the
/// un-prefixed <c>acronyms</c> key, then to "false") and <c>language</c> (defaults to "en").
/// </remarks>
/// <param name="name">Prefix used to look up this annotator's properties.</param>
/// <param name="props">Properties holding the configuration.</param>
public EntityMentionsAnnotator(string name, Properties props)
{
    // If the user has supplied custom CoreAnnotations for the ner tags and entity mentions,
    // override the default keys. Each override is resolved on its own so that one class name
    // that fails to load does not silently discard the overrides that follow it.
    Type overrideType;
    if (TryResolveAnnotationType(props, name + ".nerCoreAnnotation", out overrideType))
    {
        nerCoreAnnotationClass = overrideType;
    }
    if (TryResolveAnnotationType(props, name + ".nerNormalizedCoreAnnotation", out overrideType))
    {
        nerNormalizedCoreAnnotationClass = overrideType;
    }
    if (TryResolveAnnotationType(props, name + ".mentionsCoreAnnotation", out overrideType))
    {
        mentionsCoreAnnotationClass = overrideType;
    }

    chunkIdentifier = new LabeledChunkIdentifier();
    doAcronyms = bool.ParseBoolean(props.GetProperty(name + ".acronyms", props.GetProperty("acronyms", "false")));

    // Set up language info; this is needed for handling creating pronominal mentions.
    entityMentionsLanguage = LanguageInfo.GetLanguageFromString(props.GetProperty(name + ".language", "en"));
}

/// <summary>
/// Looks up <paramref name="propertyKey"/> and, when present, loads the type it names.
/// </summary>
/// <param name="props">Properties to read from.</param>
/// <param name="propertyKey">Fully prefixed property key holding a type name.</param>
/// <param name="resolved">The loaded type on success; <c>null</c> otherwise.</param>
/// <returns>
/// <c>true</c> when the key is present and the type was loaded; <c>false</c> when the key is
/// absent or loading raised a <see cref="TypeLoadException"/> (which is logged, not rethrown).
/// </returns>
private bool TryResolveAnnotationType(Properties props, string propertyKey, out Type resolved)
{
    resolved = null;
    if (!props.Contains(propertyKey))
    {
        return false;
    }
    try
    {
        resolved = (Type)Sharpen.Runtime.GetType(props.GetProperty(propertyKey));
        return true;
    }
    catch (TypeLoadException e)
    {
        // Best effort: keep the default annotation key for this property and carry on.
        log.Error(e.Message);
        return false;
    }
}
/// <summary>Tells whether the given language is one of the segmenter languages.</summary>
/// <param name="language">The language to test.</param>
/// <returns><c>true</c> for Arabic or Chinese; <c>false</c> for every other value.</returns>
public static bool IsSegmenterLanguage(LanguageInfo.HumanLanguage language)
{
    if (language == LanguageInfo.HumanLanguage.Arabic)
    {
        return true;
    }
    return language == LanguageInfo.HumanLanguage.Chinese;
}
/// <summary>Builds an NER combiner annotator from the supplied properties.</summary>
/// <remarks>
/// Keys read directly by this constructor: <c>ner.model</c>, <c>ner.usePresentDateForDocDate</c>,
/// <c>ner.providedDocDate</c>, <c>ner.verbose</c>, <c>ner.nthreads</c> (falling back to
/// <c>nthreads</c>), <c>ner.maxtime</c>, <c>ner.maxlen</c> and <c>ner.language</c>, plus the keys
/// named by the <c>NERClassifierCombiner</c> and <c>NumberSequenceClassifier</c> constants used below.
/// </remarks>
/// <param name="properties">Configuration for this annotator and the classifier combiner it wraps.</param>
/// <exception cref="System.IO.IOException"/>
public NERCombinerAnnotator(Properties properties)
{
    // A missing "ner.model" selects the three default models; an explicitly empty value selects none.
    string modelSpec = properties.GetProperty("ner.model")
        ?? (DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel);

    IList<string> modelList = new List<string>();
    if (!modelSpec.IsEmpty())
    {
        Sharpen.Collections.AddAll(modelList, Arrays.AsList(modelSpec.Split(",")));
    }
    if (modelList.IsEmpty())
    {
        // Allow for no real NER model - can just use numeric classifiers or SUTime.
        // Have to unset ner.model, so unlikely that people got here by accident.
        log.Info("WARNING: no NER models specified");
    }

    bool numericClassifiersEnabled = PropertiesUtils.GetBool(
        properties,
        NERClassifierCombiner.ApplyNumericClassifiersProperty,
        NERClassifierCombiner.ApplyNumericClassifiersDefault);
    bool gazetteEnabled = PropertiesUtils.GetBool(
        properties,
        NERClassifierCombiner.ApplyGazetteProperty,
        NERClassifierCombiner.ApplyGazetteDefault);
    bool sutimeEnabled = PropertiesUtils.GetBool(
        properties,
        NumberSequenceClassifier.UseSutimeProperty,
        NumberSequenceClassifier.UseSutimeDefault);

    // Option for setting the doc date to be the present during each annotation.
    usePresentDateForDocDate = PropertiesUtils.GetBool(properties, "ner." + "usePresentDateForDocDate", false);

    // Option for setting the doc date from a provided string; anything that is not
    // exactly four digits, dash, two digits, dash, two digits is replaced by the empty string.
    string requestedDocDate = PropertiesUtils.GetString(properties, "ner." + "providedDocDate", string.Empty);
    bool docDateWellFormed = Pattern.Compile("[0-9]{4}\\-[0-9]{2}\\-[0-9]{2}").Matcher(requestedDocDate).Matches();
    providedDocDate = docDateWellFormed ? requestedDocDate : string.Empty;

    string nerLanguageName = PropertiesUtils.GetString(properties, NERClassifierCombiner.NerLanguageProperty, null);
    NERClassifierCombiner.Language combinerLanguage =
        NERClassifierCombiner.Language.FromString(nerLanguageName, NERClassifierCombiner.NerLanguageDefault);

    bool verboseRequested = PropertiesUtils.GetBool(properties, "ner." + "verbose", false);

    string[] modelPaths = Sharpen.Collections.ToArray(modelList, new string[modelList.Count]);

    Properties passDownProperties =
        PropertiesUtils.ExtractSelectedProperties(properties, NERClassifierCombiner.DefaultPassDownProperties);
    if (sutimeEnabled)
    {
        // Make sure SUTime parameters are included
        Properties sutimeSettings = PropertiesUtils.ExtractPrefixedProperties(
            properties,
            NumberSequenceClassifier.SutimeProperty + '.',
            true);
        PropertiesUtils.OverWriteProperties(passDownProperties, sutimeSettings);
    }

    NERClassifierCombiner combiner = new NERClassifierCombiner(
        numericClassifiersEnabled,
        combinerLanguage,
        sutimeEnabled,
        gazetteEnabled,
        passDownProperties,
        modelPaths);

    // "ner.nthreads" wins over the global "nthreads"; with neither set the value is 1.
    int globalThreadCount = PropertiesUtils.GetInt(properties, "nthreads", 1);
    this.nThreads = PropertiesUtils.GetInt(properties, "ner.nthreads", globalThreadCount);
    this.maxTime = PropertiesUtils.GetLong(properties, "ner.maxtime", 0);
    this.maxSentenceLength = PropertiesUtils.GetInt(properties, "ner.maxlen", int.MaxValue);

    string languageName = PropertiesUtils.GetString(properties, "ner.language", "en");
    this.language = LanguageInfo.GetLanguageFromString(languageName);

    // In case of Spanish, use the Spanish number regexner annotator.
    if (language.Equals(LanguageInfo.HumanLanguage.Spanish))
    {
        Properties spanishNumberSettings = new Properties();
        spanishNumberSettings["spanish.number.regexner.mapping"] = spanishNumberRegexRules;
        spanishNumberSettings["spanish.number.regexner.validpospattern"] = "^(NUM).*";
        spanishNumberSettings["spanish.number.regexner.ignorecase"] = "true";
        spanishNumberAnnotator = new TokensRegexNERAnnotator("spanish.number.regexner", spanishNumberSettings);
    }

    // Fine grained NER, then additional rules NER, then entity mention building.
    SetUpFineGrainedNER(properties);
    SetUpAdditionalRulesNER(properties);
    SetUpEntityMentionBuilding(properties);

    Verbose = verboseRequested;
    this.ner = combiner;
}
/// <summary>Create a new KBP annotator from the given properties.</summary>
/// <remarks>
/// Any exception raised while assembling the relation extractors or parsing <c>kbp.maxlen</c>
/// is rethrown wrapped in a <c>RuntimeIOException</c>.
/// </remarks>
/// <param name="name">Annotator name, handed to <c>ArgumentParser.FillOptions</c> along with <paramref name="props"/>.</param>
/// <param name="props">The properties to use when creating this extractor.</param>
public KBPAnnotator(string name, Properties props)
{
    // Parse standard properties
    ArgumentParser.FillOptions(this, name, props);
    kbpProperties = props;

    try
    {
        List<IKBPRelationExtractor> ensembleMembers = new List<IKBPRelationExtractor>();

        // Rule-based extractors: tokensregex first, then semgrex.
        if (!tokensregexdir.Equals(NotProvided))
        {
            ensembleMembers.Add(new KBPTokensregexExtractor(tokensregexdir, Verbose));
        }
        if (!semgrexdir.Equals(NotProvided))
        {
            ensembleMembers.Add(new KBPSemgrexExtractor(semgrexdir, Verbose));
        }

        // Statistical model: accepted either as a bare linear classifier (wrapped here)
        // or as an already-built statistical extractor.
        if (!model.Equals(NotProvided))
        {
            log.Info("Loading KBP classifier from: " + model);
            object loaded = IOUtils.ReadObjectFromURLOrClasspathOrFileSystem(model);
            if (loaded is LinearClassifier)
            {
                ensembleMembers.Add(new KBPStatisticalExtractor((IClassifier<string, string>)loaded));
            }
            else if (loaded is KBPStatisticalExtractor)
            {
                ensembleMembers.Add((KBPStatisticalExtractor)loaded);
            }
            else
            {
                throw new InvalidCastException(loaded.GetType() + " cannot be cast into a " + typeof(KBPStatisticalExtractor));
            }
        }

        // Build the ensemble out of whatever extractors were configured.
        IKBPRelationExtractor[] memberArray =
            Sharpen.Collections.ToArray(ensembleMembers, new IKBPRelationExtractor[ensembleMembers.Count]);
        this.extractor = new KBPEnsembleExtractor(memberArray);

        // Maximum length of sentence to operate on; "-1" is the value used when the key is unset.
        string maxLengthText = props.GetProperty("kbp.maxlen", "-1");
        maxLength = System.Convert.ToInt32(maxLengthText);
    }
    catch (Exception e)
    {
        throw new RuntimeIOException(e);
    }

    // Map for converting between older and new KBP relation names.
    relationNameConversionMap = new Dictionary<string, string>
    {
        { "org:dissolved", "org:date_dissolved" },
        { "org:founded", "org:date_founded" },
        { "org:number_of_employees/members", "org:number_of_employees_members" },
        { "org:political/religious_affiliation", "org:political_religious_affiliation" },
        { "org:top_members/employees", "org:top_members_employees" },
        { "per:member_of", "per:employee_or_member_of" },
        { "per:employee_of", "per:employee_or_member_of" },
        { "per:stateorprovinces_of_residence", "per:statesorprovinces_of_residence" }
    };

    // Set up the KBP language and build the Spanish coref system if necessary.
    string kbpLanguageName = props.GetProperty("kbp.language", "en");
    kbpLanguage = LanguageInfo.GetLanguageFromString(kbpLanguageName);
    if (LanguageInfo.HumanLanguage.Spanish.Equals(kbpLanguage))
    {
        spanishCorefSystem = new KBPBasicSpanishCorefSystem();
    }
}