Ejemplo n.º 1
0
        /// <summary>
        /// Factory for the NERClassifierCombiner, configured from a property set.
        /// </summary>
        /// <param name="name">
        /// The "x" part of "x.y" style property names. When null the prefix "ner." is used;
        /// when it is the empty string no prefix is used; otherwise the prefix is name + ".".
        /// </param>
        /// <param name="passDownProperties">
        /// Names of the properties to hand on to the NERClassifierCombiner. When null,
        /// the full property set is handed on unchanged.
        /// </param>
        /// <param name="properties">
        /// The property set to read. The comma-separated model list is read from
        /// prefix + "model"; when that key is absent the three default models are used.
        /// </param>
        /// <returns>An NERClassifierCombiner built from the given properties.</returns>
        /// <exception cref="RuntimeIOException">
        /// Wraps any IOException raised while the combiner is being built.
        /// </exception>
        public static NERClassifierCombiner CreateNERClassifierCombiner(string name, ICollection <string> passDownProperties, Properties properties)
        {
            // Work out the property prefix: null -> "ner.", empty -> none, anything else -> name + "."
            string prefix;
            if (name == null)
            {
                prefix = "ner.";
            }
            else if (name.IsEmpty())
            {
                prefix = string.Empty;
            }
            else
            {
                prefix = name + '.';
            }

            string modelSpec = properties.GetProperty(prefix + "model");
            if (modelSpec == null)
            {
                modelSpec = DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel;
            }

            // The model property may have been set explicitly to the empty string,
            // which means "no NER model": only numeric classifiers or SUTime can then apply.
            string[] models;
            if (modelSpec.IsEmpty())
            {
                log.Info("WARNING: no NER models specified");
                models = StringUtils.EmptyStringArray;
            }
            else
            {
                models = modelSpec.Split(",");
            }

            try
            {
                bool applyNumericClassifiers = PropertiesUtils.GetBool(properties, prefix + ApplyNumericClassifiersPropertyBase, ApplyNumericClassifiersDefault);
                bool useSUTime = PropertiesUtils.GetBool(properties, prefix + NumberSequenceClassifier.UseSutimePropertyBase, NumberSequenceClassifier.UseSutimeDefault);
                // NOTE(review): unlike the two flags above, this key is looked up without the prefix - confirm that is intended.
                bool applyRegexner = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);

                // With no pass-down list, everything is passed through as is.
                Properties combinerProperties = properties;
                if (passDownProperties != null)
                {
                    combinerProperties = PropertiesUtils.ExtractSelectedProperties(properties, passDownProperties);
                    if (useSUTime)
                    {
                        // SUTime settings must reach the combiner even when they are not in the pass-down list.
                        Properties sutimeProps = PropertiesUtils.ExtractPrefixedProperties(properties, NumberSequenceClassifier.SutimeProperty + ".", true);
                        PropertiesUtils.OverWriteProperties(combinerProperties, sutimeProps);
                    }
                }

                string languageName = properties.GetProperty(prefix + "language");
                NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.FromString(languageName, NERClassifierCombiner.Language.English);

                return new NERClassifierCombiner(applyNumericClassifiers, nerLanguage, useSUTime, applyRegexner, combinerProperties, models);
            }
            catch (IOException ioFailure)
            {
                throw new RuntimeIOException(ioFailure);
            }
        }
        /// <summary>
        /// Builds a ClassifierCombiner from a serialized stream, with <paramref name="props"/>
        /// overriding the properties stored in the stream.
        /// </summary>
        /// <remarks>
        /// The stream is consumed in this order: the initial properties (used for the base
        /// constructor), a second copy of the initial properties, the list of load paths,
        /// the combination mode name, the number of base classifiers, and then the base
        /// classifiers themselves.
        /// </remarks>
        /// <param name="ois">Stream holding the serialized combiner.</param>
        /// <param name="props">
        /// Properties that overwrite the serialized ones. A valid "ner.combinationMode"
        /// value here replaces the serialized combination mode; an invalid one is ignored.
        /// </param>
        /// <exception cref="System.IO.IOException">
        /// A base classifier could be loaded neither as a CRF nor as a CMM. The inner
        /// exception is a System.AggregateException holding both failures, CRF first.
        /// </exception>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.InvalidCastException"/>
        public ClassifierCombiner(ObjectInputStream ois, Properties props)
            : base(PropertiesUtils.OverWriteProperties((Properties)ois.ReadObject(), props))
        {
            // The stream carries a second copy of the initial properties; read it and apply the overrides again.
            // TODO: probably set initProps in AbstractSequenceClassifier to avoid reading the properties twice.
            this.initProps = PropertiesUtils.OverWriteProperties((Properties)ois.ReadObject(), props);
            this.initLoadPaths = (List <string>)ois.ReadObject();
            string serializedMode = (string)ois.ReadObject();

            // An override is honoured only when it names a valid mode; otherwise the serialized mode is used.
            string requestedMode = props.GetProperty("ner.combinationMode");
            ClassifierCombiner.CombinationMode resolvedMode;
            if (requestedMode == null)
            {
                resolvedMode = ClassifierCombiner.CombinationMode.ValueOf(serializedMode);
            }
            else
            {
                try
                {
                    resolvedMode = ClassifierCombiner.CombinationMode.ValueOf(requestedMode);
                }
                catch (ArgumentException)
                {
                    resolvedMode = ClassifierCombiner.CombinationMode.ValueOf(serializedMode);
                }
            }
            this.combinationMode = resolvedMode;

            int numClassifiers = ois.ReadInt();
            this.baseClassifiers = new List <AbstractSequenceClassifier <IN> >();

            for (int i = 0; i < numClassifiers; i++)
            {
                try
                {
                    log.Info("loading CRF...");
                    CRFClassifier <IN> newCRF = ErasureUtils.UncheckedCast(CRFClassifier.GetClassifier(ois, props));
                    baseClassifiers.Add(newCRF);
                }
                catch (Exception crfFailure)
                {
                    // Each entry is tried as a CRF first and as a CMM second.
                    // NOTE(review): the CMM attempt reads from wherever the failed CRF attempt left the stream - confirm
                    // that CRFClassifier.GetClassifier fails before consuming classifier data.
                    try
                    {
                        log.Info("loading CMM...");
                        CMMClassifier newCMM = ErasureUtils.UncheckedCast(CMMClassifier.GetClassifier(ois, props));
                        baseClassifiers.Add(newCMM);
                    }
                    catch (Exception cmmFailure)
                    {
                        // Report both failures; dropping the CRF one would hide the cause whenever the entry really was a CRF.
                        throw new IOException("Couldn't load classifier!", new System.AggregateException("Classifier could be loaded neither as a CRF nor as a CMM.", crfFailure, cmmFailure));
                    }
                }
            }
        }
        /// <summary>
        /// Configures the annotator from the given properties and builds the
        /// NERClassifierCombiner it delegates to.
        /// </summary>
        /// <param name="properties">
        /// Settings to read. The comma-separated model list comes from "ner.model"; when that
        /// key is absent the three default models are used, and when it is the empty string
        /// no models are loaded.
        /// </param>
        /// <exception cref="System.IO.IOException"/>
        public NERCombinerAnnotator(Properties properties)
        {
            string modelSpec = properties.GetProperty("ner.model");
            if (modelSpec == null)
            {
                modelSpec = DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel;
            }

            IList <string> modelList = new List <string>();
            if (!modelSpec.IsEmpty())
            {
                Sharpen.Collections.AddAll(modelList, Arrays.AsList(modelSpec.Split(",")));
            }
            if (modelList.IsEmpty())
            {
                // Running without a real NER model is allowed (numeric classifiers or SUTime can still apply).
                // ner.model has to be unset explicitly to get here, so it is unlikely to happen by accident.
                log.Info("WARNING: no NER models specified");
            }

            bool applyNumericClassifiers = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyNumericClassifiersProperty, NERClassifierCombiner.ApplyNumericClassifiersDefault);
            bool applyRegexner = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);
            bool useSUTime = PropertiesUtils.GetBool(properties, NumberSequenceClassifier.UseSutimeProperty, NumberSequenceClassifier.UseSutimeDefault);

            // Option: use the present date as the document date during each annotation.
            usePresentDateForDocDate = PropertiesUtils.GetBool(properties, "ner." + "usePresentDateForDocDate", false);

            // Option: take the document date from a supplied string; a value that does not
            // match the date pattern below is discarded.
            providedDocDate = PropertiesUtils.GetString(properties, "ner." + "providedDocDate", string.Empty);
            bool hasDateShape = Pattern.Compile("[0-9]{4}\\-[0-9]{2}\\-[0-9]{2}").Matcher(providedDocDate).Matches();
            if (!hasDateShape)
            {
                providedDocDate = string.Empty;
            }

            string nerLanguageName = PropertiesUtils.GetString(properties, NERClassifierCombiner.NerLanguageProperty, null);
            NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.FromString(nerLanguageName, NERClassifierCombiner.NerLanguageDefault);
            bool beVerbose = PropertiesUtils.GetBool(properties, "ner." + "verbose", false);

            string[] loadPaths = Sharpen.Collections.ToArray(modelList, new string[modelList.Count]);
            Properties combinerProperties = PropertiesUtils.ExtractSelectedProperties(properties, NERClassifierCombiner.DefaultPassDownProperties);
            if (useSUTime)
            {
                // SUTime settings must reach the combiner as well.
                Properties sutimeProps = PropertiesUtils.ExtractPrefixedProperties(properties, NumberSequenceClassifier.SutimeProperty + '.', true);
                PropertiesUtils.OverWriteProperties(combinerProperties, sutimeProps);
            }
            NERClassifierCombiner combiner = new NERClassifierCombiner(applyNumericClassifiers, nerLanguage, useSUTime, applyRegexner, combinerProperties, loadPaths);

            // "ner.nthreads" falls back to the general "nthreads" setting, which itself defaults to 1.
            int fallbackThreads = PropertiesUtils.GetInt(properties, "nthreads", 1);
            nThreads = PropertiesUtils.GetInt(properties, "ner.nthreads", fallbackThreads);
            maxTime = PropertiesUtils.GetLong(properties, "ner.maxtime", 0);
            maxSentenceLength = PropertiesUtils.GetInt(properties, "ner.maxlen", int.MaxValue);

            string languageName = PropertiesUtils.GetString(properties, "ner.language", "en");
            language = LanguageInfo.GetLanguageFromString(languageName);

            // Spanish gets its own number regexner annotator.
            if (language.Equals(LanguageInfo.HumanLanguage.Spanish))
            {
                Properties spanishProps = new Properties();
                spanishProps["spanish.number.regexner.mapping"] = spanishNumberRegexRules;
                spanishProps["spanish.number.regexner.validpospattern"] = "^(NUM).*";
                spanishProps["spanish.number.regexner.ignorecase"] = "true";
                spanishNumberAnnotator = new TokensRegexNERAnnotator("spanish.number.regexner", spanishProps);
            }

            // Fine-grained NER, additional rules NER and entity mention building, in that order.
            SetUpFineGrainedNER(properties);
            SetUpAdditionalRulesNER(properties);
            SetUpEntityMentionBuilding(properties);

            Verbose = beVerbose;
            ner = combiner;
        }