예제 #1
0
        /// <summary>
        /// Command-line entry point: tokenizes text read from standard input and writes the
        /// tokens to standard output, separated by single spaces.
        /// </summary>
        /// <remarks>
        /// The <c>tokenizeNLs</c> option is always appended to whatever the <c>options</c>
        /// property supplies, and every newline token ends the current output line.
        /// Properties consulted: <c>help</c>, <c>ftb</c>, <c>options</c>, <c>encoding</c>
        /// (default UTF-8) and <c>lowerCase</c> (default false). A line/token summary is
        /// written to standard error once the input is exhausted.
        /// </remarks>
        /// <param name="args">Command-line arguments, parsed against <c>ArgOptionDefs()</c>.</param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }

            // Lexer setup: the FTB factory is used only when "ftb" was supplied.
            ITokenizerFactory <CoreLabel> tf;
            if (options.Contains("ftb"))
            {
                tf = FrenchTokenizer.FtbFactory();
            }
            else
            {
                tf = FrenchTokenizer.Factory();
            }

            // This entry point always splits on newlines; there is no finer-grained
            // sentence splitting available from here.
            string userOptions  = options.GetProperty("options", string.Empty);
            string orthoOptions = userOptions.IsEmpty() ? "tokenizeNLs" : userOptions + ",tokenizeNLs";
            tf.SetOptions(orthoOptions);

            // Remaining options
            string encoding = options.GetProperty("encoding", "UTF-8");
            bool   toLower  = PropertiesUtils.GetBool(options, "lowerCase", false);

            // Counters for the final report; newline tokens are included in the token count.
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool needsSeparator = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (!word.Equals(FrenchLexer.NewlineToken))
                    {
                        // Ordinary token: emit a separating space except at the start of a line.
                        if (needsSeparator)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(toLower ? word.ToLower(Locale.French) : word);
                        needsSeparator = true;
                        continue;
                    }
                    // Newline token: close the current output line.
                    ++nLines;
                    needsSeparator = false;
                    System.Console.Out.WriteLine();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                log.Error(e);
            }

            long   elapsedTime = Runtime.NanoTime() - startTime;
            double elapsedSecs = elapsedTime / 1e9;
            double linesPerSec = (double)nLines / elapsedSecs;

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }
예제 #2
0
        /// <summary>
        /// Applies the base-class options, then configures the split set, the tag-set flag,
        /// the treebank and the lexical / POS mappers from <paramref name="opts"/>.
        /// </summary>
        /// <param name="opts">Configuration properties.</param>
        /// <returns>The value returned by the base implementation.</returns>
        public override bool SetOptions(Properties opts)
        {
            bool baseResult = base.SetOptions(opts);

            // Optional split file
            if (opts.Contains(ConfigParser.paramSplit))
            {
                splitSet = MakeSplitSet(opts.GetProperty(ConfigParser.paramSplit));
            }

            CcTagset = PropertiesUtils.GetBool(opts, ConfigParser.paramCCTagset, false);
            treebank = new MemoryTreebank(new FrenchXMLTreeReaderFactory(CcTagset), FrenchTreebankLanguagePack.FtbEncoding);

            // Install a default lexical mapper only when none has been set.
            if (lexMapper == null)
            {
                lexMapper = new DefaultMapper();
                lexMapper.Setup(null, lexMapOptions.Split(","));
            }

            // Without mapping files there is nothing to do for the POS mapper.
            if (pathsToMappings.Count == 0)
            {
                return(baseResult);
            }
            if (posMapper == null)
            {
                posMapper = new DefaultMapper();
            }
            foreach (File mappingFile in pathsToMappings)
            {
                posMapper.Setup(mappingFile);
            }
            return(baseResult);
        }
예제 #3
0
            /// <summary>
            /// Creates a factory that hands <paramref name="factory"/> to its tokenizers and
            /// reads the <c>tokenizeNLs</c> flag (default false) from <paramref name="options"/>.
            /// </summary>
            /// <param name="factory">Token factory stored for later use.</param>
            /// <param name="options">Option string parsed with <c>StringUtils.StringToProperties</c>.</param>
            public WhitespaceTokenizerFactory(ILexedTokenFactory <T> factory, string options)
            {
                this.factory = factory;

                Properties parsedOptions = StringUtils.StringToProperties(options);
                bool       splitNewlines = PropertiesUtils.GetBool(parsedOptions, "tokenizeNLs", false);
                this.tokenizeNLs = splitNewlines;
            }
예제 #4
0
 /// <summary>
 /// Builds the coreference annotator: creates the <c>CorefSystem</c> from
 /// <paramref name="props"/> and, unless <c>coref.useCustomMentionDetection</c> is set,
 /// the default mention annotator.
 /// </summary>
 /// <param name="props">Pipeline properties; stored on the annotator.</param>
 /// <exception cref="System.Exception">
 /// Thrown when the hybrid algorithm is requested for English or when the coref system
 /// cannot be created; the original failure is attached as the inner exception.
 /// </exception>
 public CorefAnnotator(Properties props)
 {
     this.props = props;
     try
     {
         // if user tries to run with coref.language = ENGLISH and coref.algorithm = hybrid, throw Exception
         // we do not support those settings at this time
         if (CorefProperties.Algorithm(props).Equals(CorefProperties.CorefAlgorithmType.Hybrid) && CorefProperties.GetLanguage(props).Equals(Locale.English))
         {
             string unsupported = "Error: coref.algorithm=hybrid is not supported for English, " + "please change coref.algorithm or coref.language";
             log.Error(unsupported);
             // Carry the explanation in the exception instead of throwing an empty one.
             throw new Exception(unsupported);
         }
         // Temporarily suppress the loading message while the coref system is built.
         props.SetProperty("coref.printConLLLoadingMessage", "false");
         try
         {
             corefSystem = new CorefSystem(props);
         }
         finally
         {
             // Always undo the temporary setting so a failed construction does not leave
             // the caller's properties modified.
             props.Remove("coref.printConLLLoadingMessage");
         }
     }
     catch (Exception e)
     {
         log.Error("Error creating CorefAnnotator...terminating pipeline construction!");
         log.Error(e);
         throw new Exception("Error creating CorefAnnotator...terminating pipeline construction!", e);
     }
     // unless custom mention detection is set, just use the default coref mention detector
     performMentionDetection = !PropertiesUtils.GetBool(props, "coref.useCustomMentionDetection", false);
     if (performMentionDetection)
     {
         mentionAnnotator = new CorefMentionAnnotator(props);
     }
 }
        /// <summary>
        /// Creates the annotator from the properties prefixed with <c>name.</c>: the
        /// <c>model</c> entry selects the model, every other prefixed entry is forwarded
        /// (prefix stripped) to <c>LoadModel</c>.
        /// </summary>
        /// <param name="name">Annotator name used as the property prefix.</param>
        /// <param name="props">Full pipeline properties.</param>
        /// <exception cref="System.Exception">Thrown when no <c>name.model</c> property is present.</exception>
        public ArabicSegmenterAnnotator(string name, Properties props)
        {
            string     prefix       = name + '.';
            string     modelPath    = null;
            Properties forwardProps = new Properties();

            foreach (string key in props.StringPropertyNames())
            {
                // Properties belonging to other annotators are ignored.
                if (!key.StartsWith(prefix))
                {
                    continue;
                }
                // Drop the name and the "." that follows it.
                string shortKey = Sharpen.Runtime.Substring(key, prefix.Length);
                string value    = props.GetProperty(key);
                if (shortKey.Equals("model"))
                {
                    modelPath = value;
                }
                else
                {
                    forwardProps.SetProperty(shortKey, value);
                }
            }

            this.Verbose = PropertiesUtils.GetBool(props, name + ".verbose", false);
            if (modelPath == null)
            {
                throw new Exception("Expected a property " + name + ".model");
            }
            LoadModel(modelPath, forwardProps);
        }
예제 #6
0
        /// <summary>
        /// Configures this instance from properties, then delegates to the three-argument
        /// <c>Init</c> overload with the column map, token factory and tokens annotation class name.
        /// </summary>
        /// <param name="name">
        /// Property prefix; when null no prefix is used, otherwise keys are looked up as <c>name.key</c>.
        /// </param>
        /// <param name="props">Properties to read <c>delimiter</c>, <c>replaceWhitespace</c>,
        /// <c>columns</c>, <c>tokens</c> and <c>tokenFactory</c> from.</param>
        public virtual void Init(string name, Properties props)
        {
            string prefix;
            if (name == null)
            {
                prefix = string.Empty;
            }
            else
            {
                prefix = name + ".";
            }

            // The delimiter pattern is only replaced when a regex is configured.
            string delimiterRegex = props.GetProperty(prefix + "delimiter");
            if (delimiterRegex != null)
            {
                delimiterPattern = Pattern.Compile(delimiterRegex);
            }

            replaceWhitespace = PropertiesUtils.GetBool(props, prefix + "replaceWhitespace", replaceWhitespace);
            string columnMap = props.GetProperty(prefix + "columns");
            tokensAnnotationClassName = props.GetProperty(prefix + "tokens", "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation");

            string factoryClassName = props.GetProperty(prefix + "tokenFactory");
            if (factoryClassName == null)
            {
                // No factory configured: fall back to the CoreLabel token factory.
                this.tokenFactory = (ICoreTokenFactory <IN>) new CoreLabelTokenFactory();
            }
            else
            {
                // Instantiate the configured factory reflectively; any failure is wrapped.
                try
                {
                    Type factoryType = Sharpen.Runtime.GetType(factoryClassName);
                    this.tokenFactory = (ICoreTokenFactory <IN>)System.Activator.CreateInstance(factoryType);
                }
                catch (Exception e)
                {
                    throw new Exception(e);
                }
            }
            Init(columnMap, this.tokenFactory, this.tokensAnnotationClassName);
        }
예제 #7
0
            /// <summary>
            /// Creates a whitespace tokenizer over <paramref name="r"/>, letting
            /// <paramref name="extraOptions"/> override this factory's <c>tokenizeNLs</c> setting.
            /// </summary>
            /// <param name="r">Reader supplying the text to tokenize.</param>
            /// <param name="extraOptions">
            /// Option string that may contain <c>tokenizeNLs</c>; when null the factory's own
            /// setting is used unchanged.
            /// </param>
            /// <returns>A new <c>WhitespaceTokenizer</c> bound to the reader.</returns>
            public virtual ITokenizer <T> GetTokenizer(Reader r, string extraOptions)
            {
                bool tokenizeNewlines = this.tokenizeNLs;

                // Only parse the options when some were supplied, so a null argument falls
                // back to the factory default instead of reaching the parser.
                if (extraOptions != null)
                {
                    Properties prop = StringUtils.StringToProperties(extraOptions);
                    tokenizeNewlines = PropertiesUtils.GetBool(prop, "tokenizeNLs", this.tokenizeNLs);
                }
                return(new WhitespaceTokenizer <T>(factory, r, tokenizeNewlines));
            }
        /// <summary>
        /// Repairs tree structure, phrasal categories and part-of-speech labels in newly
        /// expanded multi-word tokens, processing the trees in chunks through a
        /// <c>MulticoreWrapper</c>.
        /// </summary>
        /// <param name="trees">Trees to process.</param>
        /// <returns>The processed trees collected from the wrapper.</returns>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        private IList <Tree> FixMultiWordTokens(IList <Tree> trees)
        {
            bool retainNer = PropertiesUtils.GetBool(options, "ner", false);

            // Resources shared by the processor
            IFactory <TreeNormalizer> normalizerFactory = new _IFactory_389();
            ITreeFactory              treeFactory       = new LabeledScoredTreeFactory();
            IThreadsafeProcessor <ICollection <Tree>, ICollection <Tree> > processor = new AnCoraProcessor.MultiWordProcessor(this, normalizerFactory, treeFactory, retainNer);

            int nProcessors = Runtime.GetRuntime().AvailableProcessors();
            MulticoreWrapper <ICollection <Tree>, ICollection <Tree> > wrapper = new MulticoreWrapper <ICollection <Tree>, ICollection <Tree> >(nProcessors, processor, false);

            // Split the input into 20 chunks per processor so parallelization pays off.
            IList <IList <Tree> > chunks = CollectionUtils.PartitionIntoFolds(trees, nProcessors * 20);
            IList <Tree>          fixedTrees = new List <Tree>();

            foreach (ICollection <Tree> chunk in chunks)
            {
                wrapper.Put(chunk);
                // Collect whatever results are already available.
                while (wrapper.Peek())
                {
                    Sharpen.Collections.AddAll(fixedTrees, wrapper.Poll());
                }
            }

            // After joining, collect the remaining results.
            wrapper.Join();
            while (wrapper.Peek())
            {
                Sharpen.Collections.AddAll(fixedTrees, wrapper.Poll());
            }
            return(fixedTrees);
        }
        /// <summary>
        /// Command-line entry point: loads the treebank paths given on the command line and
        /// prints every tree, either stripping the start-symbol bracket (option <c>b</c>) or
        /// adding it when it is missing.
        /// </summary>
        /// <param name="args">Command-line arguments, parsed against <c>ArgDefs()</c>.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(Usage());
                System.Environment.Exit(-1);
            }
            Properties options             = StringUtils.ArgsToProperties(args, ArgDefs());
            Language   language            = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            // Fix: the encoding used to be read from "l", the language option, so any explicit
            // language selection was also used as the charset name.
            // NOTE(review): "e" is assumed to be the encoding flag - confirm against ArgDefs()/Usage().
            string encoding      = options.GetProperty("e", "UTF-8");
            bool   removeBracket = PropertiesUtils.GetBool(options, "b", false);

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb    = tlpp.DiskTreebank();
            string[]     files = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (files.Length != 0)
            {
                foreach (string filename in files)
                {
                    tb.LoadPath(filename);
                }
            }
            else
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            PrintWriter  pwo         = tlpp.Pw();
            string       startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
            ITreeFactory tf          = new LabeledScoredTreeFactory();
            int          nTrees      = 0;

            try
            {
                foreach (Tree t in tb)
                {
                    // A foreach iteration variable cannot be reassigned in C#, so the tree
                    // that will be printed is tracked in a separate local.
                    Tree outTree = t;
                    if (removeBracket)
                    {
                        if (t.Value().Equals(startSymbol))
                        {
                            outTree = t.FirstChild();
                        }
                    }
                    else
                    {
                        if (!t.Value().Equals(startSymbol))
                        {
                            //Add a bracket if it isn't already there
                            outTree = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(t));
                        }
                    }
                    pwo.Println(outTree.ToString());
                    nTrees++;
                }
            }
            finally
            {
                // Close the writer even when reading or printing a tree fails.
                pwo.Close();
            }
            System.Console.Error.Printf("Processed %d trees.%n", nTrees);
        }
예제 #10
0
        /// <summary>
        /// Reads the options from properties prefixed with <paramref name="name"/>, resolves
        /// the grammar file for the configured language, and instantiates the configured binders.
        /// </summary>
        /// <param name="name">Prefix for every property key (<c>name.key</c>).</param>
        /// <param name="props">Properties to read from.</param>
        /// <exception cref="System.Exception">Thrown when a binder cannot be created or initialized.</exception>
        public Options(string name, Properties props)
        {
            // Simple flags; each keeps its current value when the property is absent.
            includeRange     = PropertiesUtils.GetBool(props, name + ".includeRange", includeRange);
            markTimeRanges   = PropertiesUtils.GetBool(props, name + ".markTimeRanges", markTimeRanges);
            includeNested    = PropertiesUtils.GetBool(props, name + ".includeNested", includeNested);
            restrictToTimex3 = PropertiesUtils.GetBool(props, name + ".restrictToTimex3", restrictToTimex3);
            teRelHeurLevel   = Options.RelativeHeuristicLevel.ValueOf(props.GetProperty(name + ".teRelHeurLevel", teRelHeurLevel.ToString()));
            verbose          = PropertiesUtils.GetBool(props, name + ".verbose", verbose);

            // Default rules are chosen by language; unknown languages fall back to "english".
            language = props.GetProperty(name + ".language", language);
            if (!languageToRulesFiles.Keys.Contains(language))
            {
                language = "english";
            }
            grammarFilename = languageToRulesFiles[language];
            // An explicit "rules" property overrides the language default.
            grammarFilename  = props.GetProperty(name + ".rules", grammarFilename);
            searchForDocDate = PropertiesUtils.GetBool(props, name + ".searchForDocDate", searchForDocDate);

            // Determine the binder class names: either the defaults or name.binder.1 .. name.binder.N.
            int      nBinders;
            string[] binderClasses;
            string   binderProperty = props.GetProperty(name + ".binders");
            if (binderProperty != null)
            {
                nBinders      = PropertiesUtils.GetInt(props, name + ".binders", 0);
                binderClasses = new string[nBinders];
                for (int k = 0; k < nBinders; ++k)
                {
                    binderClasses[k] = props.GetProperty(name + ".binder." + (k + 1));
                }
            }
            else
            {
                nBinders      = DefaultBinders.Length;
                binderClasses = DefaultBinders;
            }

            // No binders are created when there are none or when the "STS" runtime property is set.
            if (nBinders <= 0 || Runtime.GetProperty("STS") != null)
            {
                return;
            }
            binders = new Env.IBinder[nBinders];
            for (int k = 0; k < nBinders; k++)
            {
                int ordinal = k + 1;
                try
                {
                    Type   binderType = Sharpen.Runtime.GetType(binderClasses[k]);
                    string initPrefix = name + ".binder." + ordinal + ".";
                    binders[k] = (Env.IBinder)System.Activator.CreateInstance(binderType);
                    binders[k].Init(initPrefix, props);
                }
                catch (Exception ex)
                {
                    throw new Exception("Error initializing binder " + ordinal, ex);
                }
            }
        }
예제 #11
0
        /// <summary>
        /// Builds a parser annotator from properties prefixed with
        /// <paramref name="annotatorName"/>: loads the parser model and reads the sentence
        /// length, timing, k-best, dependency-graph and threading settings.
        /// </summary>
        /// <param name="annotatorName">Prefix for every annotator-specific property key.</param>
        /// <param name="props">Pipeline properties.</param>
        /// <exception cref="System.ArgumentException">Thrown when no model location is available.</exception>
        public ParserAnnotator(string annotatorName, Properties props)
        {
            string model = props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc);
            if (model == null)
            {
                throw new ArgumentException("No model specified for Parser annotator " + annotatorName);
            }

            this.Verbose = PropertiesUtils.GetBool(props, annotatorName + ".debug", false);
            string   flagsProperty = props.GetProperty(annotatorName + ".flags");
            string[] flags         = ConvertFlagsToArray(flagsProperty);
            this.parser            = LoadModel(model, Verbose, flags);
            this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", -1);

            // Optional tree map, loaded reflectively when a class name is configured.
            string treeMapClass = props.GetProperty(annotatorName + ".treemap");
            if (treeMapClass != null)
            {
                this.treeMap = ReflectionLoading.LoadByReflection(treeMapClass, props);
            }
            else
            {
                this.treeMap = null;
            }

            this.maxParseTime = PropertiesUtils.GetLong(props, annotatorName + ".maxtime", -1);
            this.kBest        = PropertiesUtils.GetInt(props, annotatorName + ".kbest", 1);
            this.keepPunct    = PropertiesUtils.GetBool(props, annotatorName + ".keepPunct", true);

            // Dependency graphs are built by default, but only if the language supports them.
            string buildGraphsProperty = annotatorName + ".buildgraphs";
            if (this.parser.GetTLPParams().SupportsBasicDependencies())
            {
                this.BuildGraphs = PropertiesUtils.GetBool(props, buildGraphsProperty, true);
            }
            else
            {
                if (PropertiesUtils.GetBool(props, buildGraphsProperty))
                {
                    log.Info("WARNING: " + buildGraphsProperty + " set to true, but " + this.parser.GetTLPParams().GetType() + " does not support dependencies");
                }
                this.BuildGraphs = false;
            }

            if (!this.BuildGraphs)
            {
                this.gsf = null;
            }
            else
            {
                bool useOriginalDeps = PropertiesUtils.GetBool(props, annotatorName + ".originalDependencies", false);
                parser.GetTLPParams().SetGenerateOriginalDependencies(useOriginalDeps);
                ITreebankLanguagePack tlp = parser.GetTLPParams().TreebankLanguagePack();
                // Punctuation is either kept (accept-all filter) or rejected by the language pack's filter.
                IPredicate <string> punctFilter;
                if (this.keepPunct)
                {
                    punctFilter = Filters.AcceptFilter();
                }
                else
                {
                    punctFilter = tlp.PunctuationWordRejectFilter();
                }
                this.gsf = tlp.GrammaticalStructureFactory(punctFilter, parser.GetTLPParams().TypedDependencyHeadFinder());
            }

            // The annotator-specific thread count falls back to the global "nthreads", then to 1.
            int globalThreads = PropertiesUtils.GetInt(props, "nthreads", 1);
            this.nThreads = PropertiesUtils.GetInt(props, annotatorName + ".nthreads", globalThreads);

            bool usesBinary = StanfordCoreNLP.UsesBinaryTrees(props);
            this.saveBinaryTrees   = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", usesBinary);
            this.noSquash          = PropertiesUtils.GetBool(props, annotatorName + ".nosquash", false);
            this.extraDependencies = MetaClass.Cast(props.GetProperty(annotatorName + ".extradependencies", "NONE"), typeof(GrammaticalStructure.Extras));
        }
        /// <summary>
        /// Creates a number annotator: reads the background symbol from
        /// <c>name.BackgroundSymbolProperty</c> and the SUTime flag from
        /// <c>NumberSequenceClassifier.UseSutimeProperty</c>, then builds the classifier.
        /// </summary>
        /// <param name="name">Prefix for the background-symbol property.</param>
        /// <param name="props">Pipeline properties.</param>
        public NumberAnnotator(string name, Properties props)
        {
            string backgroundKey = name + "." + BackgroundSymbolProperty;
            BackgroundSymbol = props.GetProperty(backgroundKey, DefaultBackgroundSymbol);

            // Note: the SUTime flag is looked up without the annotator-name prefix.
            bool withSUTime = PropertiesUtils.GetBool(props, NumberSequenceClassifier.UseSutimeProperty, NumberSequenceClassifier.UseSutimeDefault);

            Verbose = false;
            nsc     = new NumberSequenceClassifier(withSUTime);
        }
예제 #13
0
            /// <summary>
            /// Creates a word-segmenting tokenizer over <paramref name="r"/>, wrapping a
            /// CoreLabel whitespace tokenizer.
            /// </summary>
            /// <param name="r">Reader supplying the text.</param>
            /// <param name="extraOptions">
            /// Optional option string; a <c>tokenizeNLs</c> entry overrides the factory's
            /// setting. When null the factory's setting is used.
            /// </param>
            /// <returns>A new <c>WordSegmentingTokenizer</c>.</returns>
            public virtual ITokenizer <IHasWord> GetTokenizer(Reader r, string extraOptions)
            {
                bool splitNewlines = (extraOptions == null)
                    ? this.tokenizeNLs
                    : PropertiesUtils.GetBool(StringUtils.StringToProperties(extraOptions), "tokenizeNLs", this.tokenizeNLs);

                ITokenizer <CoreLabel> whitespaceTokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r, splitNewlines);
                return(new WordSegmentingTokenizer(segmenter, whitespaceTokenizer));
            }
예제 #14
0
        /// <summary>
        /// Factory method that builds the NERClassifierCombiner used by NERCombinerAnnotator
        /// (and therefore by StanfordCoreNLP).
        /// </summary>
        /// <param name="name">
        /// The "x" part of "x.y" property names. Null means the prefix "ner." is used; the
        /// empty string means no prefix at all.
        /// </param>
        /// <param name="passDownProperties">
        /// Names of properties to hand down to the NERClassifierCombiner (useful for things
        /// such as charset encoding). When null, all properties are passed through.
        /// </param>
        /// <param name="properties">
        /// Source properties, including the comma-separated model list under prefix + "model".
        /// </param>
        /// <returns>An NERClassifierCombiner configured from the given properties.</returns>
        public static NERClassifierCombiner CreateNERClassifierCombiner(string name, ICollection <string> passDownProperties, Properties properties)
        {
            string prefix;
            if (name == null)
            {
                prefix = "ner.";
            }
            else if (name.IsEmpty())
            {
                prefix = string.Empty;
            }
            else
            {
                prefix = name + '.';
            }

            // An absent model property selects the three default models.
            string modelNames = properties.GetProperty(prefix + "model");
            if (modelNames == null)
            {
                modelNames = DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel;
            }

            // The property may still have been set explicitly to the empty string.
            string[] models;
            if (modelNames.IsEmpty())
            {
                // No real NER model is allowed - numeric classifiers or SUTime can be used alone.
                log.Info("WARNING: no NER models specified");
                models = StringUtils.EmptyStringArray;
            }
            else
            {
                models = modelNames.Split(",");
            }

            try
            {
                bool applyNumericClassifiers = PropertiesUtils.GetBool(properties, prefix + ApplyNumericClassifiersPropertyBase, ApplyNumericClassifiersDefault);
                bool useSUTime     = PropertiesUtils.GetBool(properties, prefix + NumberSequenceClassifier.UseSutimePropertyBase, NumberSequenceClassifier.UseSutimeDefault);
                bool applyRegexner = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);

                Properties combinerProperties;
                if (passDownProperties == null)
                {
                    // Nothing selected explicitly: pass everything through.
                    combinerProperties = properties;
                }
                else
                {
                    combinerProperties = PropertiesUtils.ExtractSelectedProperties(properties, passDownProperties);
                    if (useSUTime)
                    {
                        // SUTime parameters must always reach the combiner.
                        Properties sutimeProps = PropertiesUtils.ExtractPrefixedProperties(properties, NumberSequenceClassifier.SutimeProperty + ".", true);
                        PropertiesUtils.OverWriteProperties(combinerProperties, sutimeProps);
                    }
                }

                string languageName = properties.GetProperty(prefix + "language");
                NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.FromString(languageName, NERClassifierCombiner.Language.English);
                return(new NERClassifierCombiner(applyNumericClassifiers, nerLanguage, useSUTime, applyRegexner, combinerProperties, models));
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
        }
 /// <summary>
 /// Reads <c>ner.buildEntityMentions</c> (default true) and, when enabled, creates the
 /// entity mentions annotator from the <c>ner.entitymentions.*</c> properties.
 /// </summary>
 /// <param name="properties">Pipeline properties.</param>
 public virtual void SetUpEntityMentionBuilding(Properties properties)
 {
     this.buildEntityMentions = PropertiesUtils.GetBool(properties, "ner.buildEntityMentions", true);
     if (!this.buildEntityMentions)
     {
         return;
     }

     string     mentionsPrefix = "ner.entitymentions";
     Properties mentionsProps  = PropertiesUtils.ExtractPrefixedProperties(properties, mentionsPrefix + ".", true);
     // The entity mention annotator needs to know the language as well.
     mentionsProps.SetProperty("ner.entitymentions.language", language.ToString());
     entityMentionsAnnotator = new EntityMentionsAnnotator(mentionsPrefix, mentionsProps);
 }
        /// <summary>
        /// Creates the annotator from the properties prefixed with <c>name.</c>: the
        /// <c>model</c> entry selects the segmentation model, every other prefixed entry is
        /// forwarded (prefix stripped) to <c>CRFClassifier.GetClassifier</c>. Also derives
        /// the newline-handling flags from the sentence-splitting properties.
        /// </summary>
        /// <param name="name">Annotator name used as the property prefix.</param>
        /// <param name="props">Full pipeline properties.</param>
        /// <exception cref="System.Exception">
        /// Thrown when no <c>name.model</c> property is present; failures while loading the
        /// classifier propagate unchanged.
        /// </exception>
        public ChineseSegmenterAnnotator(string name, Properties props)
        {
            string model = null;
            // Keep only the properties that apply to this annotator
            Properties modelProps = new Properties();
            string     desiredKey = name + '.';

            foreach (string key in props.StringPropertyNames())
            {
                if (key.StartsWith(desiredKey))
                {
                    // skip past name and the subsequent "."
                    string modelKey = Sharpen.Runtime.Substring(key, desiredKey.Length);
                    if (modelKey.Equals("model"))
                    {
                        model = props.GetProperty(key);
                    }
                    else
                    {
                        modelProps.SetProperty(modelKey, props.GetProperty(key));
                    }
                }
            }
            this.Verbose        = PropertiesUtils.GetBool(props, name + ".verbose", false);
            this.normalizeSpace = PropertiesUtils.GetBool(props, name + ".normalizeSpace", false);
            if (model == null)
            {
                throw new Exception("Expected a property " + name + ".model");
            }
            // don't write very much, because the CRFClassifier already reports loading
            if (Verbose)
            {
                log.Info("Loading Segmentation Model ... ");
            }
            // The former try block had two identical catch (Exception) clauses; the first
            // simply rethrew, leaving the second unreachable. Calling directly lets every
            // failure propagate unchanged, which is what the first clause did.
            segmenter = CRFClassifier.GetClassifier(model, modelProps);

            string newlineIsSentenceBreak = props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never");
            // If newlines are treated as sentence split, we need to retain them in tokenization for ssplit to make use of them.
            // The splitter flag counts as set only when its value is "true" (case-insensitive).
            tokenizeNewline = (!newlineIsSentenceBreak.Equals("never")) || "true".Equals(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"), System.StringComparison.OrdinalIgnoreCase);
            // record whether or not sentence splitting on two newlines ; if so, need to remove single newlines
            sentenceSplitOnTwoNewlines = newlineIsSentenceBreak.Equals("two");
        }
예제 #17
0
        /// <summary>
        /// Creates the quote attribution annotator: reads resource paths and sieve lists from
        /// <paramref name="props"/>, loads the family, gender and animacy word lists, and
        /// loads the character map when a characters file is configured.
        /// </summary>
        /// <param name="props">
        /// Properties consulted: <c>verbose</c>, <c>booknlpCoref</c>, <c>modelPath</c>,
        /// <c>charactersPath</c>, <c>QMSieves</c>, <c>MSSieves</c>, <c>familyWordsFile</c>,
        /// <c>animacyWordsFile</c>, <c>genderNamesFile</c> and <c>useCoref</c>.
        /// </param>
        public QuoteAttributionAnnotator(Properties props)
        {
            // settings
            // these paths go in the props file
            // fields
            Verbose = PropertiesUtils.GetBool(props, "verbose", false);
            Timing timer = null;

            CorefPath = props.GetProperty("booknlpCoref", null);
            if (CorefPath == null && Verbose)
            {
                log.Err("Warning: no coreference map!");
            }
            ModelPath      = props.GetProperty("modelPath", DefaultModelPath);
            CharactersFile = props.GetProperty("charactersPath", null);
            if (CharactersFile == null && Verbose)
            {
                log.Err("Warning: no characters file!");
            }
            qmSieveList = props.GetProperty("QMSieves", DefaultQmsieves);
            msSieveList = props.GetProperty("MSSieves", DefaultMssieves);
            if (Verbose)
            {
                // The timer exists only in verbose mode; it is stopped under the same flag below.
                timer = new Timing();
                log.Info("Loading QuoteAttribution coref [" + CorefPath + "]...");
                log.Info("Loading QuoteAttribution characters [" + CharactersFile + "]...");
            }
            // loading all our word lists
            FamilyWordList  = props.GetProperty("familyWordsFile", FamilyWordList);
            AnimacyWordList = props.GetProperty("animacyWordsFile", AnimacyWordList);
            GenderWordList  = props.GetProperty("genderNamesFile", GenderWordList);
            familyRelations = QuoteAttributionUtils.ReadFamilyRelations(FamilyWordList);
            genderMap       = QuoteAttributionUtils.ReadGenderedNounList(GenderWordList);
            animacyList     = QuoteAttributionUtils.ReadAnimacyList(AnimacyWordList);
            // Fix: decide on the configured file, not on characterMap itself. The old test
            // (characterMap != null) ran before the map was ever assigned here, so a
            // configured characters file was not loaded.
            // NOTE(review): assumes characterMap has no non-null field initializer - confirm.
            if (CharactersFile != null)
            {
                characterMap = QuoteAttributionUtils.ReadPersonMap(CharactersFile);
            }
            else
            {
                buildCharacterMapPerAnnotation = true;
            }
            // use Stanford CoreNLP coref to map mentions to canonical mentions
            useCoref = PropertiesUtils.GetBool(props, "useCoref", useCoref);
            if (Verbose)
            {
                timer.Stop("done.");
            }
        }
예제 #18
0
        /// <summary>
        /// Command-line entry point: reads the tree file named by the unnamed argument,
        /// builds a unigram tagger from its trees, resolves DUMMY tags and prints statistics
        /// about the missing POS and phrasal labels.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments, parsed against <c>argOptionDefs</c>. Usage is printed when
        /// no file is given or <c>help</c> is present. Options read: <c>ner</c> (default
        /// false) and <c>normalize</c> (default true).
        /// </param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);

            if (!options.Contains(string.Empty) || options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
            bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
            File treeFile  = new File(options.GetProperty(string.Empty));
            // Only the unigram tagger is used below; the four other counters that used to be
            // allocated here were never read and have been removed.
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                try
                {
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        UpdateTagger(unigramTagger, t);
                    }
                }
                finally
                {
                    // Closes the underlying reader, also when reading a tree fails.
                    tr.Close();
                }
                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
                System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);
                // string.Format uses composite-format placeholders; the former Java-style
                // %d / %.2f specifiers were printed verbatim and the values were never shown.
                System.Console.Out.WriteLine(string.Format("#Missing POS: {0} (fixed: {1}, {2:F2}%)", nMissingPOS, nFixedPOS, (double)nFixedPOS / nMissingPOS * 100));
                System.Console.Out.WriteLine(string.Format("#Missing Phrasal: {0} (fixed: {1}, {2:F2}%)", nMissingPhrasal, nFixedPhrasal, (double)nFixedPhrasal / nMissingPhrasal * 100));
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Creates a POS tagger annotator whose settings are read from properties
        /// prefixed with <paramref name="annotatorName"/>.
        /// </summary>
        /// <param name="annotatorName">
        /// Prefix for the ".model", ".verbose", ".maxlen", ".nthreads" and ".reuseTags" keys.
        /// </param>
        /// <param name="props">Properties holding the annotator configuration.</param>
        public POSTaggerAnnotator(string annotatorName, Properties props)
        {
            // Use the default model location when no explicit one is configured.
            string modelLocation = props.GetProperty(annotatorName + ".model") ?? DefaultPaths.DefaultPosModel;
            bool   beVerbose     = PropertiesUtils.GetBool(props, annotatorName + ".verbose", false);

            this.pos = LoadModel(modelLocation, beVerbose);
            this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", int.MaxValue);

            // The annotator-specific thread count falls back to the global "nthreads" key, then to 1.
            int globalThreads = PropertiesUtils.GetInt(props, "nthreads", 1);
            this.nThreads  = PropertiesUtils.GetInt(props, annotatorName + ".nthreads", globalThreads);
            this.reuseTags = PropertiesUtils.GetBool(props, annotatorName + ".reuseTags", false);
        }
        /// <summary>
        /// Reads every file in <c>inputFiles</c> through a
        /// <see cref="Edu.Stanford.Nlp.Trees.International.Spanish.SpanishXMLTreeReader"/>
        /// and gathers the resulting trees, while accumulating the per-file counts
        /// into <c>unigramTagger</c> for use in later MWE cleanup.
        /// NB: Much of the important cleanup happens implicitly here; the XML tree reader triggers the
        /// tree normalization routine.
        /// </summary>
        /// <returns>All trees read from the input files, in file order.</returns>
        /// <exception cref="System.Exception"/>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        private IList <Tree> LoadTrees()
        {
            bool   retainNer       = PropertiesUtils.GetBool(options, "ner", false);
            string treebankCharset = new SpanishTreebankLanguagePack().GetEncoding();
            SpanishXMLTreeReaderFactory readerFactory = new SpanishXMLTreeReaderFactory(true, true, retainNer, false);
            IList <Tree> allTrees = new List <Tree>();

            foreach (File inputFile in inputFiles)
            {
                // First element: counts for this file; second element: the trees read from it.
                Pair <TwoDimensionalCounter <string, string>, IList <Tree> > fileResult = ProcessTreeFile(inputFile, readerFactory, treebankCharset);

                Counters.AddInPlace(unigramTagger, fileResult.First());
                Sharpen.Collections.AddAll(allTrees, fileResult.Second());
            }
            return(allTrees);
        }
        /// <summary>
        /// Creates the annotator, reading the background symbol and the collapse flag
        /// from properties prefixed with <paramref name="name"/>.
        /// </summary>
        /// <param name="name">Prefix (followed by ".") for the property keys that are read.</param>
        /// <param name="props">Properties holding the annotator configuration.</param>
        public QuantifiableEntityNormalizingAnnotator(string name, Properties props)
        {
            // TODO: collapse = true won't work properly (see annotateTokens)
            string backgroundSymbol = props.GetProperty(name + "." + BackgroundSymbolProperty, DefaultBackgroundSymbol);

            // QuantifiableEntityNormalizer is still static, so the symbol is set globally here.
            QuantifiableEntityNormalizer.BackgroundSymbol = backgroundSymbol;

            collapse = PropertiesUtils.GetBool(props, name + "." + CollapseProperty, false);
            if (collapse)
            {
                log.Info("WARNING: QuantifiableEntityNormalizingAnnotator does not work well with collapse=true");
            }
            Verbose = false;
        }
예제 #22
0
        /// <summary>
        /// Overrides the configuration fields with any values present in
        /// <paramref name="props"/>; a field keeps its current value when its key is absent.
        /// </summary>
        /// <param name="props">Properties holding training, runtime and language options.</param>
        private void SetProperties(Properties props)
        {
            // Training options
            trainingThreads       = PropertiesUtils.GetInt(props, "trainingThreads", trainingThreads);
            wordCutOff            = PropertiesUtils.GetInt(props, "wordCutOff", wordCutOff);
            initRange             = PropertiesUtils.GetDouble(props, "initRange", initRange);
            maxIter               = PropertiesUtils.GetInt(props, "maxIter", maxIter);
            batchSize             = PropertiesUtils.GetInt(props, "batchSize", batchSize);
            adaEps                = PropertiesUtils.GetDouble(props, "adaEps", adaEps);
            adaAlpha              = PropertiesUtils.GetDouble(props, "adaAlpha", adaAlpha);
            regParameter          = PropertiesUtils.GetDouble(props, "regParameter", regParameter);
            dropProb              = PropertiesUtils.GetDouble(props, "dropProb", dropProb);
            hiddenSize            = PropertiesUtils.GetInt(props, "hiddenSize", hiddenSize);
            embeddingSize         = PropertiesUtils.GetInt(props, "embeddingSize", embeddingSize);
            numPreComputed        = PropertiesUtils.GetInt(props, "numPreComputed", numPreComputed);
            evalPerIter           = PropertiesUtils.GetInt(props, "evalPerIter", evalPerIter);
            clearGradientsPerIter = PropertiesUtils.GetInt(props, "clearGradientsPerIter", clearGradientsPerIter);
            saveIntermediate      = PropertiesUtils.GetBool(props, "saveIntermediate", saveIntermediate);
            unlabeled             = PropertiesUtils.GetBool(props, "unlabeled", unlabeled);
            cPOS   = PropertiesUtils.GetBool(props, "cPOS", cPOS);
            noPunc = PropertiesUtils.GetBool(props, "noPunc", noPunc);
            doWordEmbeddingGradUpdate = PropertiesUtils.GetBool(props, "doWordEmbeddingGradUpdate", doWordEmbeddingGradUpdate);
            // Runtime parsing options
            sentenceDelimiter = PropertiesUtils.GetString(props, "sentenceDelimiter", sentenceDelimiter);
            tagger            = PropertiesUtils.GetString(props, "tagger.model", tagger);
            string escaperClass = props.GetProperty("escaper");

            // The escaper is optional; it is only instantiated when a class name is given.
            escaper = escaperClass != null ? ReflectionLoading.LoadByReflection(escaperClass) : null;

            // Language options
            language = props.Contains("language") ? GetLanguage(props.GetProperty("language")) : language;
            // Default language pack comes from the selected language's parameters.
            // (This expression had been mangled into an e-mail placeholder and did not compile.)
            tlp      = language.@params.TreebankLanguagePack();
            // if a tlp was specified go with that
            string tlpCanonicalName = props.GetProperty("tlp");

            if (tlpCanonicalName != null)
            {
                try
                {
                    tlp = ReflectionLoading.LoadByReflection(tlpCanonicalName);
                    System.Console.Error.WriteLine("Loaded TreebankLanguagePack: " + tlpCanonicalName);
                }
                catch (Exception)
                {
                    // Best effort: on failure the language-derived pack assigned above stays in use.
                    System.Console.Error.WriteLine("Error: Failed to load TreebankLanguagePack: " + tlpCanonicalName);
                }
            }
        }
 /// <summary>
 /// Reads the "ner.applyFineGrained" flag (default true) and, when it is enabled,
 /// builds the fine-grained <c>TokensRegexNERAnnotator</c> from the
 /// "ner.fine.regexner."-prefixed properties.
 /// </summary>
 /// <param name="properties">Properties holding the NER configuration.</param>
 public virtual void SetUpFineGrainedNER(Properties properties)
 {
     this.applyFineGrained = PropertiesUtils.GetBool(properties, "ner.applyFineGrained", true);
     if (!this.applyFineGrained)
     {
         // Fine-grained NER is disabled; no annotator is built.
         return;
     }

     string     prefix        = "ner.fine.regexner";
     Properties regexnerProps = PropertiesUtils.ExtractPrefixedProperties(properties, prefix + ".", true);

     // Explicitly set the fine-grained NER default mapping when none was supplied.
     bool hasMapping = regexnerProps.Contains("ner.fine.regexner.mapping");
     if (!hasMapping)
     {
         regexnerProps["ner.fine.regexner.mapping"] = DefaultPaths.DefaultKbpTokensregexNerSettings;
     }

     fineGrainedNERAnnotator = new TokensRegexNERAnnotator(prefix, regexnerProps);
 }
예제 #24
0
 /// <summary>
 /// Creates a combiner configured from <paramref name="props"/>: numeric-classifier
 /// flag, NER language, SUTime usage and the optional regexner gazette mapping.
 /// </summary>
 /// <param name="props">Properties holding the combiner configuration.</param>
 /// <exception cref="System.IO.IOException"/>
 public NERClassifierCombiner(Properties props)
     : base(props)
 {
     // todo [cdm 2015]: Could avoid constructing this if applyNumericClassifiers is false
     applyNumericClassifiers = PropertiesUtils.GetBool(props, ApplyNumericClassifiersProperty, ApplyNumericClassifiersDefault);

     string languageName = PropertiesUtils.GetString(props, NerLanguageProperty, null);
     nerLanguage = NERClassifierCombiner.Language.FromString(languageName, NerLanguageDefault);

     useSUTime = PropertiesUtils.GetBool(props, NumberSequenceClassifier.UseSutimeProperty, NumberSequenceClassifier.UseSutimeDefault);
     nsc       = new NumberSequenceClassifier(new Properties(), useSUTime, props);

     // The gazette is only read when requested; otherwise an empty mapping is used.
     bool applyGazette = PropertiesUtils.GetBool(props, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);
     if (!applyGazette)
     {
         this.gazetteMapping = Java.Util.Collections.EmptyMap();
     }
     else
     {
         this.gazetteMapping = ReadRegexnerGazette(DefaultPaths.DefaultNerGazetteMapping);
     }
 }
 /// <summary>
 /// Rebuilds <c>baseClassifiers</c>: an optional preset-tag classifier (when
 /// "ner.usePresetNERTags" is true) followed by one classifier per entry in
 /// <paramref name="paths"/>. The background symbol is then copied from the first
 /// classifier, if there is one.
 /// </summary>
 /// <param name="props">Properties passed to every classifier that is created.</param>
 /// <param name="paths">Classifier paths, loaded in the given order.</param>
 /// <exception cref="System.IO.IOException"/>
 private void LoadClassifiers(Properties props, IList <string> paths)
 {
     baseClassifiers = new List <AbstractSequenceClassifier <IN> >();

     bool usePresetTags = PropertiesUtils.GetBool(props, "ner.usePresetNERTags", false);
     if (usePresetTags)
     {
         baseClassifiers.Add(new PresetSequenceClassifier(props));
     }

     foreach (string classifierPath in paths)
     {
         baseClassifiers.Add(LoadClassifierFromPath(props, classifierPath));
     }

     if (baseClassifiers.Count != 0)
     {
         // The first classifier determines the combiner's background symbol.
         flags.backgroundSymbol = baseClassifiers[0].flags.backgroundSymbol;
     }
 }
예제 #26
0
        /// <summary>
        /// Reads tagged sentences from the file given by the "input" property and prints
        /// a random-length prefix of each one; with "fullSentence" set, the whole
        /// sentence is printed afterwards as well.
        /// </summary>
        /// <param name="args">Command-line arguments, converted to properties.</param>
        public static void Main(string[] args)
        {
            Properties settings = StringUtils.ArgsToProperties(args);

            log.Info(settings);
            bool             printFull = PropertiesUtils.GetBool(settings, "fullSentence", false);
            Random           rng       = new Random();
            string           separator = settings.GetProperty("tagSeparator", TaggerConfig.TagSeparator);
            TaggedFileRecord input     = TaggedFileRecord.CreateRecord(settings, settings.GetProperty("input"));

            foreach (IList <TaggedWord> taggedSentence in input.Reader())
            {
                // Prefix length is drawn from 1..Count inclusive.
                int prefixLength = rng.NextInt(taggedSentence.Count) + 1;
                IList <TaggedWord> prefix = taggedSentence.SubList(0, prefixLength);

                System.Console.Out.WriteLine(SentenceUtils.ListToString(prefix, false, separator));
                if (!printFull)
                {
                    continue;
                }
                System.Console.Out.WriteLine(SentenceUtils.ListToString(taggedSentence, false, separator));
            }
        }
예제 #27
0
        /// <summary>
        /// Builds a CoNLL document reader for the coref input path configured in
        /// <paramref name="props"/>.
        /// </summary>
        /// <param name="props">Coref properties (input path, file filter, language).</param>
        /// <returns>The reader, or <c>null</c> when no input path is configured.</returns>
        private static IDocReader GetDocumentReader(Properties props)
        {
            string inputPath = CorefProperties.GetInputPath(props);

            if (inputPath == null)
            {
                return(null);
            }

            CoNLLDocumentReader.Options readerOptions = new CoNLLDocumentReader.Options();
            bool printLoading = PropertiesUtils.GetBool(props, "coref.printConLLLoadingMessage", true);
            if (!printLoading)
            {
                // Only ever switched off here; otherwise the option keeps its default.
                readerOptions.printConLLLoadingMessage = false;
            }
            readerOptions.annotateTokenCoref = false;

            string fileFilter = props.GetProperty("coref.conllFileFilter", ".*_auto_conll$");
            readerOptions.SetFilter(fileFilter);
            readerOptions.lang = CorefProperties.GetLanguage(props);

            return(new CoNLLDocumentReader(inputPath, readerOptions));
        }
        /// <summary>
        /// Command-line driver: parses the options, builds a Spanish XML tree reader
        /// factory, and submits one task per input file to a fixed-size thread pool,
        /// then waits for the pool to terminate.
        /// </summary>
        /// <param name="args">Command-line arguments; the non-option arguments are treated as file paths.</param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            // Print usage and stop when no arguments were given or help was requested.
            if (args.Length < 1 || options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // NOTE(review): posPattern, wordPattern, plainPrint and trf are not used anywhere
            // below — presumably they were consumed by the per-file task (see note at Execute).
            Pattern posPattern          = options.Contains("searchPos") ? Pattern.Compile(options.GetProperty("searchPos")) : null;
            Pattern wordPattern         = options.Contains("searchWord") ? Pattern.Compile(options.GetProperty("searchWord")) : null;
            bool    plainPrint          = PropertiesUtils.GetBool(options, "plain", false);
            bool    ner                 = PropertiesUtils.GetBool(options, "ner", false);
            bool    detailedAnnotations = PropertiesUtils.GetBool(options, "detailedAnnotations", false);

            // The non-option arguments are stored under the empty-string key, separated by spaces.
            string[]     remainingArgs = options.GetProperty(string.Empty).Split(" ");
            IList <File> fileList      = new List <File>();

            foreach (string remainingArg in remainingArgs)
            {
                fileList.Add(new File(remainingArg));
            }
            SpanishXMLTreeReaderFactory trf  = new SpanishXMLTreeReaderFactory(true, true, ner, detailedAnnotations);
            // One worker thread per available processor.
            IExecutorService            pool = Executors.NewFixedThreadPool(Runtime.GetRuntime().AvailableProcessors());

            foreach (File file in fileList)
            {
                // NOTE(review): a null task is submitted and `file` is never used — this looks like
                // a worker lambda that was lost in the Java-to-C# conversion; confirm against the
                // original source and restore the per-file task.
                pool.Execute(null);
            }
            pool.Shutdown();
            try
            {
                // Effectively waits without a timeout for all submitted tasks.
                pool.AwaitTermination(long.MaxValue, TimeUnit.Nanoseconds);
            }
            catch (Exception e)
            {
                // Any failure while waiting is rethrown wrapped in RuntimeInterruptedException.
                throw new RuntimeInterruptedException(e);
            }
        }
예제 #29
0
        /// <summary>
        /// Builds a signature string from the parser-related properties of the given
        /// annotator, substituting defaults for properties that are not set.
        /// </summary>
        /// <param name="annotatorName">Prefix of the property keys that make up the signature.</param>
        /// <param name="props">Properties to read the values from.</param>
        /// <returns>The concatenated "key:value" entries, with no separator between them.</returns>
        public static string Signature(string annotatorName, Properties props)
        {
            // The annotator-specific thread count falls back to the global "nthreads" key.
            string globalThreads = props.GetProperty("nthreads", string.Empty);
            StringBuilder signature = new StringBuilder();

            signature
                .Append(annotatorName + ".model:" + props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc))
                .Append(annotatorName + ".debug:" + props.GetProperty(annotatorName + ".debug", "false"))
                .Append(annotatorName + ".flags:" + props.GetProperty(annotatorName + ".flags", string.Empty))
                .Append(annotatorName + ".maxlen:" + props.GetProperty(annotatorName + ".maxlen", "-1"))
                .Append(annotatorName + ".treemap:" + props.GetProperty(annotatorName + ".treemap", string.Empty))
                .Append(annotatorName + ".maxtime:" + props.GetProperty(annotatorName + ".maxtime", "-1"))
                .Append(annotatorName + ".originalDependencies:" + props.GetProperty(annotatorName + ".originalDependencies", "false"))
                .Append(annotatorName + ".buildgraphs:" + props.GetProperty(annotatorName + ".buildgraphs", "true"))
                .Append(annotatorName + ".nthreads:" + props.GetProperty(annotatorName + ".nthreads", globalThreads))
                .Append(annotatorName + ".nosquash:" + props.GetProperty(annotatorName + ".nosquash", "false"))
                .Append(annotatorName + ".keepPunct:" + props.GetProperty(annotatorName + ".keepPunct", "true"))
                .Append(annotatorName + ".extradependencies:" + props.GetProperty(annotatorName + ".extradependencies", "NONE").ToLower());

            // The binary-trees default depends on whether the pipeline uses binary trees.
            bool pipelineUsesBinary = StanfordCoreNLP.UsesBinaryTrees(props);
            bool keepBinaryTrees    = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", pipelineUsesBinary);

            signature.Append(annotatorName + ".binaryTrees:" + keepBinaryTrees);
            return(signature.ToString());
        }
        /// <summary>
        /// Command-line driver for <c>TreebankStats</c>: the first positional argument
        /// names the language, the remaining ones are corpus paths. Exits with status -1
        /// after printing usage when the argument count is wrong.
        /// </summary>
        /// <param name="args">Command-line arguments (options "s", "w", "f", "o" plus positionals).</param>
        public static void Main(string[] args)
        {
            if (args.Length < MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }

            Properties options      = StringUtils.ArgsToProperties(args, OptArgDefs());
            string     splitPrefix  = options.GetProperty("s", null);
            bool       showWords    = PropertiesUtils.GetBool(options, "w", false);
            bool       pathsAsFiles = PropertiesUtils.GetBool(options, "f", false);
            bool       showOov      = PropertiesUtils.GetBool(options, "o", false);

            // Positional arguments are stored under the empty-string key.
            string[] positional = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (positional.Length != MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }

            Language       language    = Language.ValueOf(positional[0]);
            IList <string> corpusPaths = new List <string>(positional.Length - 1);

            // Everything after the language name is a corpus path.
            int index = 1;
            while (index < positional.Length)
            {
                corpusPaths.Add(positional[index]);
                ++index;
            }

            ITreebankLangParserParams langParams = language.@params;
            TreebankStats             stats      = new TreebankStats(language, corpusPaths, langParams);

            if (splitPrefix != null && !stats.UseSplit(splitPrefix))
            {
                log.Info("Could not load split!");
            }
            stats.Run(pathsAsFiles, showWords, showOov);
        }