/// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
/// <remarks>
/// Performs punctuation splitting and light tokenization by default.
/// Currently, this tokenizer does not do line splitting. It assumes that the input
/// file is delimited by the system line separator. The output will be equivalently
/// delimited.
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args) {
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help")) {
        log.Info(Usage());
        return;
    }
    // Lexer options
    ITokenizerFactory<CoreLabel> tf = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory();
    string orthoOptions = options.GetProperty("options", string.Empty);
    // When called from this main method, split on newline. No options for
    // more granular sentence splitting.
    orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
    tf.SetOptions(orthoOptions);
    // Other options
    string encoding = options.GetProperty("encoding", "UTF-8");
    bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false);
    // Read the file from stdin
    int nLines = 0;
    int nTokens = 0;
    long startTime = Runtime.NanoTime();
    try {
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        bool printSpace = false;
        while (tokenizer.MoveNext()) {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(FrenchLexer.NewlineToken)) {
                ++nLines;
                printSpace = false;
                System.Console.Out.WriteLine();
            } else {
                if (printSpace) {
                    System.Console.Out.Write(" ");
                }
                string outputToken = toLower ? word.ToLower(Locale.French) : word;
                System.Console.Out.Write(outputToken);
                printSpace = true;
            }
        }
    } catch (UnsupportedEncodingException e) {
        log.Error(e);
    }
    long elapsedTime = Runtime.NanoTime() - startTime;
    double linesPerSec = (double)nLines / (elapsedTime / 1e9);
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}
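// A minimal usage sketch of the same factory API exercised by Main above, but over an
// in-memory string instead of stdin. It assumes the converted runtime provides a
// StringReader type (any Reader over the input would do); one token is printed per line.
public static void TokenizeStringExample() {
    ITokenizerFactory<CoreLabel> tf = FrenchTokenizer.Factory();
    tf.SetOptions("tokenizeNLs");
    ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader("L'état, c'est moi."));
    while (tokenizer.MoveNext()) {
        System.Console.Out.WriteLine(tokenizer.Current.Word());
    }
}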
public override bool SetOptions(Properties opts) {
    bool ret = base.SetOptions(opts);
    if (opts.Contains(ConfigParser.paramSplit)) {
        string splitFileName = opts.GetProperty(ConfigParser.paramSplit);
        splitSet = MakeSplitSet(splitFileName);
    }
    CcTagset = PropertiesUtils.GetBool(opts, ConfigParser.paramCCTagset, false);
    treebank = new MemoryTreebank(new FrenchXMLTreeReaderFactory(CcTagset), FrenchTreebankLanguagePack.FtbEncoding);
    if (lexMapper == null) {
        lexMapper = new DefaultMapper();
        lexMapper.Setup(null, lexMapOptions.Split(","));
    }
    if (pathsToMappings.Count != 0) {
        if (posMapper == null) {
            posMapper = new DefaultMapper();
        }
        foreach (File path in pathsToMappings) {
            posMapper.Setup(path);
        }
    }
    return ret;
}
public WhitespaceTokenizerFactory(ILexedTokenFactory<T> factory, string options) {
    this.factory = factory;
    Properties prop = StringUtils.StringToProperties(options);
    this.tokenizeNLs = PropertiesUtils.GetBool(prop, "tokenizeNLs", false);
}
public CorefAnnotator(Properties props) {
    this.props = props;
    try {
        // if user tries to run with coref.language = ENGLISH and coref.algorithm = hybrid, throw Exception
        // we do not support those settings at this time
        if (CorefProperties.Algorithm(props).Equals(CorefProperties.CorefAlgorithmType.Hybrid) && CorefProperties.GetLanguage(props).Equals(Locale.English)) {
            log.Error("Error: coref.algorithm=hybrid is not supported for English, " + "please change coref.algorithm or coref.language");
            throw new Exception();
        }
        // suppress
        props.SetProperty("coref.printConLLLoadingMessage", "false");
        corefSystem = new CorefSystem(props);
        props.Remove("coref.printConLLLoadingMessage");
    } catch (Exception e) {
        log.Error("Error creating CorefAnnotator...terminating pipeline construction!");
        log.Error(e);
        throw new Exception(e.Message, e);
    }
    // unless custom mention detection is set, just use the default coref mention detector
    performMentionDetection = !PropertiesUtils.GetBool(props, "coref.useCustomMentionDetection", false);
    if (performMentionDetection) {
        mentionAnnotator = new CorefMentionAnnotator(props);
    }
}
public ArabicSegmenterAnnotator(string name, Properties props) {
    string model = null;
    // Keep only the properties that apply to this annotator
    Properties modelProps = new Properties();
    string desiredKey = name + '.';
    foreach (string key in props.StringPropertyNames()) {
        if (key.StartsWith(desiredKey)) {
            // skip past name and the subsequent "."
            string modelKey = Sharpen.Runtime.Substring(key, desiredKey.Length);
            if (modelKey.Equals("model")) {
                model = props.GetProperty(key);
            } else {
                modelProps.SetProperty(modelKey, props.GetProperty(key));
            }
        }
    }
    this.Verbose = PropertiesUtils.GetBool(props, name + ".verbose", false);
    if (model == null) {
        throw new Exception("Expected a property " + name + ".model");
    }
    LoadModel(model, modelProps);
}
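// A small sketch of the prefix filtering done above, assuming the annotator is registered
// under the name "segment" (the keys and model path are illustrative only): only
// "segment.*" keys are kept, the prefix is stripped, and "segment.model" is pulled out.
public static Properties ExampleArabicSegmenterProps() {
    Properties props = new Properties();
    props.SetProperty("segment.model", "/path/to/arabic-segmenter-model.ser.gz"); // hypothetical path
    props.SetProperty("segment.verbose", "true");   // forwarded to modelProps as "verbose"
    props.SetProperty("pos.model", "unrelated");    // different prefix, ignored by this annotator
    return props;
}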
public virtual void Init(string name, Properties props) {
    string prefix = (name == null) ? string.Empty : name + ".";
    string delimiterRegex = props.GetProperty(prefix + "delimiter");
    if (delimiterRegex != null) {
        delimiterPattern = Pattern.Compile(delimiterRegex);
    }
    replaceWhitespace = PropertiesUtils.GetBool(props, prefix + "replaceWhitespace", replaceWhitespace);
    string mapString = props.GetProperty(prefix + "columns");
    tokensAnnotationClassName = props.GetProperty(prefix + "tokens", "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation");
    string tokenFactoryClassName = props.GetProperty(prefix + "tokenFactory");
    if (tokenFactoryClassName != null) {
        try {
            this.tokenFactory = (ICoreTokenFactory<IN>)System.Activator.CreateInstance(Sharpen.Runtime.GetType(tokenFactoryClassName));
        } catch (Exception e) {
            throw new Exception(e.Message, e);
        }
    } else {
        this.tokenFactory = (ICoreTokenFactory<IN>)new CoreLabelTokenFactory();
    }
    Init(mapString, this.tokenFactory, this.tokensAnnotationClassName);
}
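// A hedged sketch of the keys read by Init above, assuming a prefix of "mycolumns"
// (the prefix, column map, and delimiter are illustrative, not documented defaults):
public static Properties ExampleColumnReaderProps() {
    Properties props = new Properties();
    props.SetProperty("mycolumns.delimiter", "\\t");        // regex used to split each input line
    props.SetProperty("mycolumns.columns", "word=0,tag=1");  // hypothetical column map passed to Init(mapString, ...)
    props.SetProperty("mycolumns.tokens", "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation");
    return props;
}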
public virtual ITokenizer<T> GetTokenizer(Reader r, string extraOptions) {
    Properties prop = StringUtils.StringToProperties(extraOptions);
    bool tokenizeNewlines = PropertiesUtils.GetBool(prop, "tokenizeNLs", this.tokenizeNLs);
    return new WhitespaceTokenizer<T>(factory, r, tokenizeNewlines);
}
/// <summary>
/// Fix tree structure, phrasal categories and part-of-speech labels in newly expanded
/// multi-word tokens.
/// </summary>
/// <exception cref="System.Exception"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
private IList<Tree> FixMultiWordTokens(IList<Tree> trees) {
    bool ner = PropertiesUtils.GetBool(options, "ner", false);
    // Shared resources
    IFactory<TreeNormalizer> tnf = new _IFactory_389();
    ITreeFactory tf = new LabeledScoredTreeFactory();
    IThreadsafeProcessor<ICollection<Tree>, ICollection<Tree>> processor = new AnCoraProcessor.MultiWordProcessor(this, tnf, tf, ner);
    int availableProcessors = Runtime.GetRuntime().AvailableProcessors();
    MulticoreWrapper<ICollection<Tree>, ICollection<Tree>> wrapper = new MulticoreWrapper<ICollection<Tree>, ICollection<Tree>>(availableProcessors, processor, false);
    // Chunk our work so that parallelization is actually worth it
    int numChunks = availableProcessors * 20;
    IList<IList<Tree>> chunked = CollectionUtils.PartitionIntoFolds(trees, numChunks);
    IList<Tree> ret = new List<Tree>();
    foreach (ICollection<Tree> coll in chunked) {
        wrapper.Put(coll);
        while (wrapper.Peek()) {
            Sharpen.Collections.AddAll(ret, wrapper.Poll());
        }
    }
    wrapper.Join();
    while (wrapper.Peek()) {
        Sharpen.Collections.AddAll(ret, wrapper.Poll());
    }
    return ret;
}
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(Usage());
        System.Environment.Exit(-1);
    }
    Properties options = StringUtils.ArgsToProperties(args, ArgDefs());
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    DiskTreebank tb = null;
    string encoding = options.GetProperty("l", "UTF-8");
    bool removeBracket = PropertiesUtils.GetBool(options, "b", false);
    tlpp.SetInputEncoding(encoding);
    tlpp.SetOutputEncoding(encoding);
    tb = tlpp.DiskTreebank();
    string[] files = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (files.Length != 0) {
        foreach (string filename in files) {
            tb.LoadPath(filename);
        }
    } else {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    PrintWriter pwo = tlpp.Pw();
    string startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
    ITreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    foreach (Tree tree in tb) {
        Tree t = tree;  // foreach iteration variables cannot be reassigned in C#
        if (removeBracket) {
            if (t.Value().Equals(startSymbol)) {
                t = t.FirstChild();
            }
        } else if (!t.Value().Equals(startSymbol)) {
            // Add a bracket if it isn't already there
            t = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(t));
        }
        pwo.Println(t.ToString());
        nTrees++;
    }
    pwo.Close();
    System.Console.Error.Printf("Processed %d trees.%n", nTrees);
}
public Options(string name, Properties props) {
    includeRange = PropertiesUtils.GetBool(props, name + ".includeRange", includeRange);
    markTimeRanges = PropertiesUtils.GetBool(props, name + ".markTimeRanges", markTimeRanges);
    includeNested = PropertiesUtils.GetBool(props, name + ".includeNested", includeNested);
    restrictToTimex3 = PropertiesUtils.GetBool(props, name + ".restrictToTimex3", restrictToTimex3);
    teRelHeurLevel = Options.RelativeHeuristicLevel.ValueOf(props.GetProperty(name + ".teRelHeurLevel", teRelHeurLevel.ToString()));
    verbose = PropertiesUtils.GetBool(props, name + ".verbose", verbose);
    // set default rules by SUTime language
    language = props.GetProperty(name + ".language", language);
    if (!languageToRulesFiles.Keys.Contains(language)) {
        language = "english";
    }
    grammarFilename = languageToRulesFiles[language];
    // override if rules are set by properties
    grammarFilename = props.GetProperty(name + ".rules", grammarFilename);
    searchForDocDate = PropertiesUtils.GetBool(props, name + ".searchForDocDate", searchForDocDate);
    string binderProperty = props.GetProperty(name + ".binders");
    int nBinders;
    string[] binderClasses;
    if (binderProperty == null) {
        nBinders = DefaultBinders.Length;
        binderClasses = DefaultBinders;
    } else {
        nBinders = PropertiesUtils.GetInt(props, name + ".binders", 0);
        binderClasses = new string[nBinders];
        for (int i = 0; i < nBinders; ++i) {
            string binderPrefix = name + ".binder." + (i + 1);
            binderClasses[i] = props.GetProperty(binderPrefix);
        }
    }
    if (nBinders > 0 && Runtime.GetProperty("STS") == null) {
        binders = new Env.IBinder[nBinders];
        for (int i = 0; i < nBinders; i++) {
            int bi = i + 1;
            string binderPrefix = name + ".binder." + bi;
            try {
                Type binderClass = Sharpen.Runtime.GetType(binderClasses[i]);
                binderPrefix = binderPrefix + ".";
                binders[i] = (Env.IBinder)System.Activator.CreateInstance(binderClass);
                binders[i].Init(binderPrefix, props);
            } catch (Exception ex) {
                throw new Exception("Error initializing binder " + bi, ex);
            }
        }
    }
}
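// Hypothetical property settings read by this constructor, assuming the annotator name
// passed in is "sutime" (the values and the binder class name are illustrative only):
public static Properties ExampleSUTimeOptions() {
    Properties props = new Properties();
    props.SetProperty("sutime.markTimeRanges", "true");
    props.SetProperty("sutime.includeNested", "true");
    props.SetProperty("sutime.language", "english");
    props.SetProperty("sutime.rules", "/path/to/custom.sutime.txt"); // overrides the per-language default
    props.SetProperty("sutime.binders", "1");
    props.SetProperty("sutime.binder.1", "my.package.MyBinder");     // hypothetical Env.IBinder implementation
    return props;
}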
public ParserAnnotator(string annotatorName, Properties props) {
    string model = props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc);
    if (model == null) {
        throw new ArgumentException("No model specified for Parser annotator " + annotatorName);
    }
    this.Verbose = PropertiesUtils.GetBool(props, annotatorName + ".debug", false);
    string[] flags = ConvertFlagsToArray(props.GetProperty(annotatorName + ".flags"));
    this.parser = LoadModel(model, Verbose, flags);
    this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", -1);
    string treeMapClass = props.GetProperty(annotatorName + ".treemap");
    if (treeMapClass == null) {
        this.treeMap = null;
    } else {
        this.treeMap = ReflectionLoading.LoadByReflection(treeMapClass, props);
    }
    this.maxParseTime = PropertiesUtils.GetLong(props, annotatorName + ".maxtime", -1);
    this.kBest = PropertiesUtils.GetInt(props, annotatorName + ".kbest", 1);
    this.keepPunct = PropertiesUtils.GetBool(props, annotatorName + ".keepPunct", true);
    string buildGraphsProperty = annotatorName + ".buildgraphs";
    if (!this.parser.GetTLPParams().SupportsBasicDependencies()) {
        if (PropertiesUtils.GetBool(props, buildGraphsProperty)) {
            log.Info("WARNING: " + buildGraphsProperty + " set to true, but " + this.parser.GetTLPParams().GetType() + " does not support dependencies");
        }
        this.BuildGraphs = false;
    } else {
        this.BuildGraphs = PropertiesUtils.GetBool(props, buildGraphsProperty, true);
    }
    if (this.BuildGraphs) {
        bool generateOriginalDependencies = PropertiesUtils.GetBool(props, annotatorName + ".originalDependencies", false);
        parser.GetTLPParams().SetGenerateOriginalDependencies(generateOriginalDependencies);
        ITreebankLanguagePack tlp = parser.GetTLPParams().TreebankLanguagePack();
        IPredicate<string> punctFilter = this.keepPunct ? Filters.AcceptFilter() : tlp.PunctuationWordRejectFilter();
        this.gsf = tlp.GrammaticalStructureFactory(punctFilter, parser.GetTLPParams().TypedDependencyHeadFinder());
    } else {
        this.gsf = null;
    }
    this.nThreads = PropertiesUtils.GetInt(props, annotatorName + ".nthreads", PropertiesUtils.GetInt(props, "nthreads", 1));
    bool usesBinary = StanfordCoreNLP.UsesBinaryTrees(props);
    this.saveBinaryTrees = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", usesBinary);
    this.noSquash = PropertiesUtils.GetBool(props, annotatorName + ".nosquash", false);
    this.extraDependencies = MetaClass.Cast(props.GetProperty(annotatorName + ".extradependencies", "NONE"), typeof(GrammaticalStructure.Extras));
}
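// Hedged sketch of the configuration this constructor reads, assuming the usual annotator
// name "parse"; the limits and thread count below are illustrative, not defaults:
public static Properties ExampleParserAnnotatorProps() {
    Properties props = new Properties();
    props.SetProperty("parse.model", "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    props.SetProperty("parse.maxlen", "80");        // skip sentences longer than 80 tokens
    props.SetProperty("parse.kbest", "1");
    props.SetProperty("parse.buildgraphs", "true"); // build dependency graphs when the language pack supports them
    props.SetProperty("parse.nthreads", "4");
    return props;
}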
public NumberAnnotator(string name, Properties props) {
    string property = name + "." + BackgroundSymbolProperty;
    BackgroundSymbol = props.GetProperty(property, DefaultBackgroundSymbol);
    bool useSUTime = PropertiesUtils.GetBool(props, NumberSequenceClassifier.UseSutimeProperty, NumberSequenceClassifier.UseSutimeDefault);
    Verbose = false;
    nsc = new NumberSequenceClassifier(useSUTime);
}
public virtual ITokenizer<IHasWord> GetTokenizer(Reader r, string extraOptions) {
    bool tokenizeNewlines = this.tokenizeNLs;
    if (extraOptions != null) {
        Properties prop = StringUtils.StringToProperties(extraOptions);
        tokenizeNewlines = PropertiesUtils.GetBool(prop, "tokenizeNLs", this.tokenizeNLs);
    }
    return new WordSegmentingTokenizer(segmenter, WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r, tokenizeNewlines));
}
/// <summary>
/// This factory method is used to create the NERClassifierCombiner used in NERCombinerAnnotator
/// (and, thence, in StanfordCoreNLP).
/// </summary>
/// <param name="name">
/// A "x.y" format property name prefix (the "x" part). This is commonly null,
/// and then "ner" is used. If it is the empty string, then no property prefix is used.
/// </param>
/// <param name="passDownProperties">
/// Property names for which the property should be passed down
/// to the NERClassifierCombiner. The default is not to pass down, but pass down is
/// useful for things like charset encoding.
/// </param>
/// <param name="properties">
/// Various properties, including a list in "ner.model".
/// The used ones start with name + "." or are in passDownProperties.
/// </param>
/// <returns>An NERClassifierCombiner with the given properties</returns>
public static NERClassifierCombiner CreateNERClassifierCombiner(string name, ICollection<string> passDownProperties, Properties properties) {
    string prefix = (name == null) ? "ner." : name.IsEmpty() ? string.Empty : name + '.';
    string modelNames = properties.GetProperty(prefix + "model");
    if (modelNames == null) {
        modelNames = DefaultPaths.DefaultNerThreeclassModel + ',' + DefaultPaths.DefaultNerMucModel + ',' + DefaultPaths.DefaultNerConllModel;
    }
    // but modelNames can still be the empty string if set explicitly to be empty!
    string[] models;
    if (!modelNames.IsEmpty()) {
        models = modelNames.Split(",");
    } else {
        // Allow for no real NER model - can just use numeric classifiers or SUTime
        log.Info("WARNING: no NER models specified");
        models = StringUtils.EmptyStringArray;
    }
    NERClassifierCombiner nerCombiner;
    try {
        bool applyNumericClassifiers = PropertiesUtils.GetBool(properties, prefix + ApplyNumericClassifiersPropertyBase, ApplyNumericClassifiersDefault);
        bool useSUTime = PropertiesUtils.GetBool(properties, prefix + NumberSequenceClassifier.UseSutimePropertyBase, NumberSequenceClassifier.UseSutimeDefault);
        bool applyRegexner = PropertiesUtils.GetBool(properties, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault);
        Properties combinerProperties;
        if (passDownProperties != null) {
            combinerProperties = PropertiesUtils.ExtractSelectedProperties(properties, passDownProperties);
            if (useSUTime) {
                // Make sure SUTime parameters are included
                Properties sutimeProps = PropertiesUtils.ExtractPrefixedProperties(properties, NumberSequenceClassifier.SutimeProperty + ".", true);
                PropertiesUtils.OverWriteProperties(combinerProperties, sutimeProps);
            }
        } else {
            // if passDownProperties is null, just pass everything through
            combinerProperties = properties;
        }
        //Properties combinerProperties = PropertiesUtils.extractSelectedProperties(properties, passDownProperties);
        NERClassifierCombiner.Language nerLanguage = NERClassifierCombiner.Language.FromString(properties.GetProperty(prefix + "language"), NERClassifierCombiner.Language.English);
        nerCombiner = new NERClassifierCombiner(applyNumericClassifiers, nerLanguage, useSUTime, applyRegexner, combinerProperties, models);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    return nerCombiner;
}
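// A minimal sketch of calling the factory method above. Passing name == null selects the
// standard "ner." prefix, and passDownProperties == null passes all properties through;
// the single-model configuration here is illustrative only.
public static NERClassifierCombiner ExampleCreateCombiner() {
    Properties props = new Properties();
    props.SetProperty("ner.model", DefaultPaths.DefaultNerThreeclassModel);
    return CreateNERClassifierCombiner(null, null, props);
}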
public virtual void SetUpEntityMentionBuilding(Properties properties) {
    this.buildEntityMentions = PropertiesUtils.GetBool(properties, "ner.buildEntityMentions", true);
    if (this.buildEntityMentions) {
        string entityMentionsPrefix = "ner.entitymentions";
        Properties entityMentionsProps = PropertiesUtils.ExtractPrefixedProperties(properties, entityMentionsPrefix + ".", true);
        // pass language info to the entity mention annotator
        entityMentionsProps.SetProperty("ner.entitymentions.language", language.ToString());
        entityMentionsAnnotator = new EntityMentionsAnnotator(entityMentionsPrefix, entityMentionsProps);
    }
}
public ChineseSegmenterAnnotator(string name, Properties props) {
    string model = null;
    // Keep only the properties that apply to this annotator
    Properties modelProps = new Properties();
    string desiredKey = name + '.';
    foreach (string key in props.StringPropertyNames()) {
        if (key.StartsWith(desiredKey)) {
            // skip past name and the subsequent "."
            string modelKey = Sharpen.Runtime.Substring(key, desiredKey.Length);
            if (modelKey.Equals("model")) {
                model = props.GetProperty(key);
            } else {
                modelProps.SetProperty(modelKey, props.GetProperty(key));
            }
        }
    }
    this.Verbose = PropertiesUtils.GetBool(props, name + ".verbose", false);
    this.normalizeSpace = PropertiesUtils.GetBool(props, name + ".normalizeSpace", false);
    if (model == null) {
        throw new Exception("Expected a property " + name + ".model");
    }
    // don't write very much, because the CRFClassifier already reports loading
    if (Verbose) {
        log.Info("Loading Segmentation Model ... ");
    }
    try {
        segmenter = CRFClassifier.GetClassifier(model, modelProps);
    } catch (Exception e) {
        throw new Exception(e.Message, e);
    }
    // If newlines are treated as sentence split, we need to retain them in tokenization for ssplit to make use of them
    tokenizeNewline = (!props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never").Equals("never")) || bool.Parse(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
    // record whether or not sentence splitting on two newlines; if so, need to remove single newlines
    sentenceSplitOnTwoNewlines = props.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, "never").Equals("two");
}
public QuoteAttributionAnnotator(Properties props) {
    // settings
    // these paths go in the props file
    // fields
    Verbose = PropertiesUtils.GetBool(props, "verbose", false);
    Timing timer = null;
    CorefPath = props.GetProperty("booknlpCoref", null);
    if (CorefPath == null && Verbose) {
        log.Err("Warning: no coreference map!");
    }
    ModelPath = props.GetProperty("modelPath", DefaultModelPath);
    CharactersFile = props.GetProperty("charactersPath", null);
    if (CharactersFile == null && Verbose) {
        log.Err("Warning: no characters file!");
    }
    qmSieveList = props.GetProperty("QMSieves", DefaultQmsieves);
    msSieveList = props.GetProperty("MSSieves", DefaultMssieves);
    if (Verbose) {
        timer = new Timing();
        log.Info("Loading QuoteAttribution coref [" + CorefPath + "]...");
        log.Info("Loading QuoteAttribution characters [" + CharactersFile + "]...");
    }
    // loading all our word lists
    FamilyWordList = props.GetProperty("familyWordsFile", FamilyWordList);
    AnimacyWordList = props.GetProperty("animacyWordsFile", AnimacyWordList);
    GenderWordList = props.GetProperty("genderNamesFile", GenderWordList);
    familyRelations = QuoteAttributionUtils.ReadFamilyRelations(FamilyWordList);
    genderMap = QuoteAttributionUtils.ReadGenderedNounList(GenderWordList);
    animacyList = QuoteAttributionUtils.ReadAnimacyList(AnimacyWordList);
    if (CharactersFile != null) {
        characterMap = QuoteAttributionUtils.ReadPersonMap(CharactersFile);
    } else {
        buildCharacterMapPerAnnotation = true;
    }
    // use Stanford CoreNLP coref to map mentions to canonical mentions
    useCoref = PropertiesUtils.GetBool(props, "useCoref", useCoref);
    if (Verbose) {
        timer.Stop("done.");
    }
}
/// <param name="args"/>
public static void Main(string[] args) {
    Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);
    if (!options.Contains(string.Empty) || options.Contains("help")) {
        log.Info(Usage());
        return;
    }
    bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
    bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
    File treeFile = new File(options.GetProperty(string.Empty));
    TwoDimensionalCounter<string, string> labelTerm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> termLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> labelPreterm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> pretermLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        for (Tree t; (t = tr.ReadTree()) != null;) {
            UpdateTagger(unigramTagger, t);
        }
        tr.Close(); // Closes the underlying reader
        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
        System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);
        System.Console.Out.Printf("#Missing POS: %d (fixed: %d, %.2f%%)%n", nMissingPOS, nFixedPOS, (double)nFixedPOS / nMissingPOS * 100);
        System.Console.Out.Printf("#Missing Phrasal: %d (fixed: %d, %.2f%%)%n", nMissingPhrasal, nFixedPhrasal, (double)nFixedPhrasal / nMissingPhrasal * 100);
        System.Console.Out.WriteLine("Done!");
    } catch (UnsupportedEncodingException e) {
        Sharpen.Runtime.PrintStackTrace(e);
    } catch (FileNotFoundException e) {
        Sharpen.Runtime.PrintStackTrace(e);
    } catch (IOException e) {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
public POSTaggerAnnotator(string annotatorName, Properties props) {
    string posLoc = props.GetProperty(annotatorName + ".model");
    if (posLoc == null) {
        posLoc = DefaultPaths.DefaultPosModel;
    }
    bool verbose = PropertiesUtils.GetBool(props, annotatorName + ".verbose", false);
    this.pos = LoadModel(posLoc, verbose);
    this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", int.MaxValue);
    this.nThreads = PropertiesUtils.GetInt(props, annotatorName + ".nthreads", PropertiesUtils.GetInt(props, "nthreads", 1));
    this.reuseTags = PropertiesUtils.GetBool(props, annotatorName + ".reuseTags", false);
}
/// <summary>
/// Use
/// <see cref="Edu.Stanford.Nlp.Trees.International.Spanish.SpanishXMLTreeReader"/>
/// to load the trees from the provided files,
/// and begin collecting some statistics to be used in later MWE cleanup.
/// NB: Much of the important cleanup happens implicitly here; the XML tree reader triggers the
/// tree normalization routine.
/// </summary>
/// <exception cref="System.Exception"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
private IList<Tree> LoadTrees() {
    bool ner = PropertiesUtils.GetBool(options, "ner", false);
    string encoding = new SpanishTreebankLanguagePack().GetEncoding();
    SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(true, true, ner, false);
    IList<Tree> trees = new List<Tree>();
    foreach (File file in inputFiles) {
        Pair<TwoDimensionalCounter<string, string>, IList<Tree>> ret = ProcessTreeFile(file, trf, encoding);
        Counters.AddInPlace(unigramTagger, ret.First());
        Sharpen.Collections.AddAll(trees, ret.Second());
    }
    return trees;
}
public QuantifiableEntityNormalizingAnnotator(string name, Properties props) {
    // TODO: collapse = true won't work properly (see annotateTokens)
    string property = name + "." + BackgroundSymbolProperty;
    string backgroundSymbol = props.GetProperty(property, DefaultBackgroundSymbol);
    // this next line is yuck as QuantifiableEntityNormalizer is still static
    QuantifiableEntityNormalizer.BackgroundSymbol = backgroundSymbol;
    property = name + "." + CollapseProperty;
    collapse = PropertiesUtils.GetBool(props, property, false);
    if (this.collapse) {
        log.Info("WARNING: QuantifiableEntityNormalizingAnnotator does not work well with collapse=true");
    }
    Verbose = false;
}
private void SetProperties(Properties props) {
    trainingThreads = PropertiesUtils.GetInt(props, "trainingThreads", trainingThreads);
    wordCutOff = PropertiesUtils.GetInt(props, "wordCutOff", wordCutOff);
    initRange = PropertiesUtils.GetDouble(props, "initRange", initRange);
    maxIter = PropertiesUtils.GetInt(props, "maxIter", maxIter);
    batchSize = PropertiesUtils.GetInt(props, "batchSize", batchSize);
    adaEps = PropertiesUtils.GetDouble(props, "adaEps", adaEps);
    adaAlpha = PropertiesUtils.GetDouble(props, "adaAlpha", adaAlpha);
    regParameter = PropertiesUtils.GetDouble(props, "regParameter", regParameter);
    dropProb = PropertiesUtils.GetDouble(props, "dropProb", dropProb);
    hiddenSize = PropertiesUtils.GetInt(props, "hiddenSize", hiddenSize);
    embeddingSize = PropertiesUtils.GetInt(props, "embeddingSize", embeddingSize);
    numPreComputed = PropertiesUtils.GetInt(props, "numPreComputed", numPreComputed);
    evalPerIter = PropertiesUtils.GetInt(props, "evalPerIter", evalPerIter);
    clearGradientsPerIter = PropertiesUtils.GetInt(props, "clearGradientsPerIter", clearGradientsPerIter);
    saveIntermediate = PropertiesUtils.GetBool(props, "saveIntermediate", saveIntermediate);
    unlabeled = PropertiesUtils.GetBool(props, "unlabeled", unlabeled);
    cPOS = PropertiesUtils.GetBool(props, "cPOS", cPOS);
    noPunc = PropertiesUtils.GetBool(props, "noPunc", noPunc);
    doWordEmbeddingGradUpdate = PropertiesUtils.GetBool(props, "doWordEmbeddingGradUpdate", doWordEmbeddingGradUpdate);
    // Runtime parsing options
    sentenceDelimiter = PropertiesUtils.GetString(props, "sentenceDelimiter", sentenceDelimiter);
    tagger = PropertiesUtils.GetString(props, "tagger.model", tagger);
    string escaperClass = props.GetProperty("escaper");
    escaper = escaperClass != null ? ReflectionLoading.LoadByReflection(escaperClass) : null;
    // Language options
    language = props.Contains("language") ? GetLanguage(props.GetProperty("language")) : language;
    tlp = language.@params.TreebankLanguagePack();
    // if a tlp was specified go with that
    string tlpCanonicalName = props.GetProperty("tlp");
    if (tlpCanonicalName != null) {
        try {
            tlp = ReflectionLoading.LoadByReflection(tlpCanonicalName);
            System.Console.Error.WriteLine("Loaded TreebankLanguagePack: " + tlpCanonicalName);
        } catch (Exception) {
            System.Console.Error.WriteLine("Error: Failed to load TreebankLanguagePack: " + tlpCanonicalName);
        }
    }
}
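// Hypothetical training configuration built from the keys read above; the values are
// illustrative only and not the parser's documented defaults:
public static Properties ExampleDepParserTrainingProps() {
    Properties props = new Properties();
    props.SetProperty("trainingThreads", "4");
    props.SetProperty("maxIter", "20000");
    props.SetProperty("embeddingSize", "50");
    props.SetProperty("hiddenSize", "200");
    props.SetProperty("adaAlpha", "0.01");
    props.SetProperty("saveIntermediate", "true");
    props.SetProperty("language", "English");
    return props;
}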
public virtual void SetUpFineGrainedNER(Properties properties) {
    // set up fine grained ner
    this.applyFineGrained = PropertiesUtils.GetBool(properties, "ner.applyFineGrained", true);
    if (this.applyFineGrained) {
        string fineGrainedPrefix = "ner.fine.regexner";
        Properties fineGrainedProps = PropertiesUtils.ExtractPrefixedProperties(properties, fineGrainedPrefix + ".", true);
        // explicitly set fine grained ner default here
        if (!fineGrainedProps.Contains("ner.fine.regexner.mapping")) {
            fineGrainedProps["ner.fine.regexner.mapping"] = DefaultPaths.DefaultKbpTokensregexNerSettings;
        }
        // build the fine grained ner TokensRegexNERAnnotator
        fineGrainedNERAnnotator = new TokensRegexNERAnnotator(fineGrainedPrefix, fineGrainedProps);
    }
}
/// <exception cref="System.IO.IOException"/>
public NERClassifierCombiner(Properties props)
    : base(props) {
    // todo [cdm 2015]: Could avoid constructing this if applyNumericClassifiers is false
    applyNumericClassifiers = PropertiesUtils.GetBool(props, ApplyNumericClassifiersProperty, ApplyNumericClassifiersDefault);
    nerLanguage = NERClassifierCombiner.Language.FromString(PropertiesUtils.GetString(props, NerLanguageProperty, null), NerLanguageDefault);
    useSUTime = PropertiesUtils.GetBool(props, NumberSequenceClassifier.UseSutimeProperty, NumberSequenceClassifier.UseSutimeDefault);
    nsc = new NumberSequenceClassifier(new Properties(), useSUTime, props);
    if (PropertiesUtils.GetBool(props, NERClassifierCombiner.ApplyGazetteProperty, NERClassifierCombiner.ApplyGazetteDefault)) {
        this.gazetteMapping = ReadRegexnerGazette(DefaultPaths.DefaultNerGazetteMapping);
    } else {
        this.gazetteMapping = Java.Util.Collections.EmptyMap();
    }
}
/// <exception cref="System.IO.IOException"/>
private void LoadClassifiers(Properties props, IList<string> paths) {
    baseClassifiers = new List<AbstractSequenceClassifier<IN>>();
    if (PropertiesUtils.GetBool(props, "ner.usePresetNERTags", false)) {
        AbstractSequenceClassifier<IN> presetASC = new PresetSequenceClassifier(props);
        baseClassifiers.Add(presetASC);
    }
    foreach (string path in paths) {
        AbstractSequenceClassifier<IN> cls = LoadClassifierFromPath(props, path);
        baseClassifiers.Add(cls);
    }
    if (baseClassifiers.Count > 0) {
        flags.backgroundSymbol = baseClassifiers[0].flags.backgroundSymbol;
    }
}
public static void Main(string[] args) {
    Properties config = StringUtils.ArgsToProperties(args);
    log.Info(config);
    bool fullSentence = PropertiesUtils.GetBool(config, "fullSentence", false);
    Random random = new Random();
    string tagSeparator = config.GetProperty("tagSeparator", TaggerConfig.TagSeparator);
    TaggedFileRecord record = TaggedFileRecord.CreateRecord(config, config.GetProperty("input"));
    foreach (IList<TaggedWord> sentence in record.Reader()) {
        int len = random.NextInt(sentence.Count) + 1;
        System.Console.Out.WriteLine(SentenceUtils.ListToString(sentence.SubList(0, len), false, tagSeparator));
        if (fullSentence) {
            System.Console.Out.WriteLine(SentenceUtils.ListToString(sentence, false, tagSeparator));
        }
    }
}
private static IDocReader GetDocumentReader(Properties props) {
    string corpusPath = CorefProperties.GetInputPath(props);
    if (corpusPath == null) {
        return null;
    }
    CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options();
    if (!PropertiesUtils.GetBool(props, "coref.printConLLLoadingMessage", true)) {
        options.printConLLLoadingMessage = false;
    }
    options.annotateTokenCoref = false;
    string conllFileFilter = props.GetProperty("coref.conllFileFilter", ".*_auto_conll$");
    options.SetFilter(conllFileFilter);
    options.lang = CorefProperties.GetLanguage(props);
    return new CoNLLDocumentReader(corpusPath, options);
}
public static void Main(string[] args) {
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (args.Length < 1 || options.Contains("help")) {
        log.Info(Usage());
        return;
    }
    Pattern posPattern = options.Contains("searchPos") ? Pattern.Compile(options.GetProperty("searchPos")) : null;
    Pattern wordPattern = options.Contains("searchWord") ? Pattern.Compile(options.GetProperty("searchWord")) : null;
    bool plainPrint = PropertiesUtils.GetBool(options, "plain", false);
    bool ner = PropertiesUtils.GetBool(options, "ner", false);
    bool detailedAnnotations = PropertiesUtils.GetBool(options, "detailedAnnotations", false);
    string[] remainingArgs = options.GetProperty(string.Empty).Split(" ");
    IList<File> fileList = new List<File>();
    foreach (string remainingArg in remainingArgs) {
        fileList.Add(new File(remainingArg));
    }
    SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(true, true, ner, detailedAnnotations);
    IExecutorService pool = Executors.NewFixedThreadPool(Runtime.GetRuntime().AvailableProcessors());
    foreach (File file in fileList) {
        pool.Execute(null);
    }
    pool.Shutdown();
    try {
        pool.AwaitTermination(long.MaxValue, TimeUnit.Nanoseconds);
    } catch (Exception e) {
        throw new RuntimeInterruptedException(e);
    }
}
public static string Signature(string annotatorName, Properties props) {
    StringBuilder os = new StringBuilder();
    os.Append(annotatorName + ".model:" + props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc));
    os.Append(annotatorName + ".debug:" + props.GetProperty(annotatorName + ".debug", "false"));
    os.Append(annotatorName + ".flags:" + props.GetProperty(annotatorName + ".flags", string.Empty));
    os.Append(annotatorName + ".maxlen:" + props.GetProperty(annotatorName + ".maxlen", "-1"));
    os.Append(annotatorName + ".treemap:" + props.GetProperty(annotatorName + ".treemap", string.Empty));
    os.Append(annotatorName + ".maxtime:" + props.GetProperty(annotatorName + ".maxtime", "-1"));
    os.Append(annotatorName + ".originalDependencies:" + props.GetProperty(annotatorName + ".originalDependencies", "false"));
    os.Append(annotatorName + ".buildgraphs:" + props.GetProperty(annotatorName + ".buildgraphs", "true"));
    os.Append(annotatorName + ".nthreads:" + props.GetProperty(annotatorName + ".nthreads", props.GetProperty("nthreads", string.Empty)));
    os.Append(annotatorName + ".nosquash:" + props.GetProperty(annotatorName + ".nosquash", "false"));
    os.Append(annotatorName + ".keepPunct:" + props.GetProperty(annotatorName + ".keepPunct", "true"));
    os.Append(annotatorName + ".extradependencies:" + props.GetProperty(annotatorName + ".extradependencies", "NONE").ToLower());
    bool usesBinary = StanfordCoreNLP.UsesBinaryTrees(props);
    bool saveBinaryTrees = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", usesBinary);
    os.Append(annotatorName + ".binaryTrees:" + saveBinaryTrees);
    return os.ToString();
}
/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length < MinArgs) {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    Properties options = StringUtils.ArgsToProperties(args, OptArgDefs());
    string splitPrefix = options.GetProperty("s", null);
    bool ShowWords = PropertiesUtils.GetBool(options, "w", false);
    bool pathsAreFiles = PropertiesUtils.GetBool(options, "f", false);
    bool ShowOov = PropertiesUtils.GetBool(options, "o", false);
    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (parsedArgs.Length != MinArgs) {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(parsedArgs[0]);
    IList<string> corpusPaths = new List<string>(parsedArgs.Length - 1);
    for (int i = 1; i < parsedArgs.Length; ++i) {
        corpusPaths.Add(parsedArgs[i]);
    }
    ITreebankLangParserParams tlpp = language.@params;
    TreebankStats cs = new TreebankStats(language, corpusPaths, tlpp);
    if (splitPrefix != null) {
        if (!cs.UseSplit(splitPrefix)) {
            log.Info("Could not load split!");
        }
    }
    cs.Run(pathsAreFiles, ShowWords, ShowOov);
}