// Creates and configures the TokensRegex environment used for number/unit extraction:
// case-insensitive matching, unit/prefix registration from the option-specified files,
// and the bindings ("options", "numcomptype", "numcompvalue") the rule files refer to.
private void InitEnv()
{
    env = TokenSequencePattern.GetNewEnv();
    // Patterns match against the numerized token sequence rather than the raw tokens.
    env.SetDefaultTokensAnnotationKey(typeof(CoreAnnotations.NumerizedTokensAnnotation));
    // Do case insensitive matching
    env.SetDefaultStringMatchFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
    env.SetDefaultStringPatternFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
    try
    {
        Units.RegisterUnits(env, options.unitsFilename);
    }
    catch (IOException ex)
    {
        throw new Exception("Error loading units from " + options.unitsFilename, ex);
    }
    try
    {
        UnitPrefix.RegisterPrefixes(env, options.prefixFilename);
    }
    catch (IOException ex)
    {
        throw new Exception("Error loading prefixes from " + options.prefixFilename, ex);
    }
    // Names referenced from within the rule files.
    env.Bind("options", options);
    env.Bind("numcomptype", typeof(CoreAnnotations.NumericCompositeTypeAnnotation));
    env.Bind("numcompvalue", typeof(CoreAnnotations.NumericCompositeValueAnnotation));
}
/// <summary>
/// Builds a <see cref="TokenSequencePattern"/> from the sequence accumulated so far,
/// then resets this builder so it can be reused.
/// </summary>
/// <returns>The compiled pattern.</returns>
public TokenSequencePattern Build()
{
    TokenSequencePattern pattern = new TokenSequencePattern(Seq);
    Reset();
    return pattern;
}
/// <summary>
/// Loads one TokensRegex extractor per KBP relation from the rule files under
/// <paramref name="tokensregexDir"/>. Relations whose rule file is absent are skipped.
/// </summary>
/// <param name="tokensregexDir">Directory containing "defs.rules" plus one ".rules" file per relation.</param>
/// <param name="verbose">When true, logs which rule files are loaded.</param>
public KBPTokensregexExtractor(string tokensregexDir, bool verbose)
{
    if (verbose)
    {
        logger.Log("Creating TokensRegexExtractor");
    }
    // Create extractors
    foreach (KBPRelationExtractor.RelationType rel in KBPRelationExtractor.RelationType.Values())
    {
        // Relation names are sanitized into file-system-safe rule file names.
        string relFileNameComponent = rel.canonicalName.ReplaceAll(":", "_");
        string path = tokensregexDir + File.separator + relFileNameComponent.ReplaceAll("/", "SLASH") + ".rules";
        if (!IOUtils.ExistsInClasspathOrFileSystem(path))
        {
            continue;  // no rules shipped for this relation
        }
        // Shared definitions are loaded first, then the relation-specific rules.
        IList<string> ruleFiles = new List<string>();
        ruleFiles.Add(tokensregexDir + File.separator + "defs.rules");
        ruleFiles.Add(path);
        if (verbose)
        {
            logger.Log("Rule files for relation " + rel + " is " + path);
        }
        Env env = TokenSequencePattern.GetNewEnv();
        env.Bind("collapseExtractionRules", true);
        env.Bind("verbose", verbose);
        CoreMapExpressionExtractor relationExtractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(env, ruleFiles).KeepTemporaryTags();
        rules[rel] = relationExtractor;
    }
}
// static demo class
/// <summary>
/// Demo: compile each line of a rules file as a TokenSequencePattern and print the
/// non-overlapping matches (with capture groups) found in each sentence of an input file.
/// Usage: rulesFile inputFile [outFile]
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
        return;
    }
    string rules = args[0];
    // Third argument, when present, names an output file; otherwise write to stdout.
    PrintWriter @out = (args.Length > 2) ? new PrintWriter(args[2]) : new PrintWriter(System.Console.Out);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
    pipeline.Annotate(annotation);
    // Load lines of file as TokenSequencePatterns
    IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
    foreach (string line in ObjectBank.GetLineIterator(rules))
    {
        tokenSequencePatterns.Add(TokenSequencePattern.Compile(line));
    }
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int sentenceIndex = 0;
    foreach (ICoreMap sentence in sentences)
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        @out.Println("Sentence #" + ++sentenceIndex);
        @out.Print(" Tokens:");
        foreach (CoreLabel token in tokens)
        {
            @out.Print(' ');
            @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        @out.Println();
        // NOTE(review): the multi-pattern matcher is rebuilt for every sentence,
        // mirroring the original code.
        MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
        IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
        int matchIndex = 0;
        foreach (ISequenceMatchResult<ICoreMap> matched in answers)
        {
            @out.Println(" Match #" + ++matchIndex);
            // Group 0 is the whole match; groups 1..GroupCount are the captures.
            for (int g = 0; g <= matched.GroupCount(); g++)
            {
                @out.Println(" group " + g + " = " + matched.Group(g));
            }
        }
    }
    @out.Flush();
}
/// <summary>
/// Sets up a worker that applies the given surface patterns — compiled into a single
/// multi-pattern matcher — to the selected sentences for one label.
/// </summary>
public ApplyPatternsMulti(IDictionary<string, DataInstance> sents, IList<string> sentids, IDictionary<TokenSequencePattern, E> patterns, string label, bool removeStopWordsFromSelectedPhrases, bool removePhrasesWithStopWords, ConstantsAndVariables cv)
{
    this.sents = sents;
    this.sentids = sentids;
    this.label = label;
    this.patterns = patterns;
    // One matcher over all patterns so each sentence is scanned only once.
    multiPatternMatcher = TokenSequencePattern.GetMultiPatternMatcher(patterns.Keys);
    this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
    this.removePhrasesWithStopWords = removePhrasesWithStopWords;
    this.constVars = cv;
}
// static main only
/// <summary>
/// Demo: extract (person, age) pairs from two example sentences using a pair of
/// TokensRegex patterns with named capture groups $who and $age.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation = new Annotation("Casey is 21. Sally Atkinson's age is 30.");
    pipeline.Annotate(annotation);
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    // Each pattern captures a PERSON span ($who) and a cardinal number ($age).
    string[] patterns = new string[] { "(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )", "(?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )" };
    IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
    foreach (string line in patterns)
    {
        tokenSequencePatterns.Add(TokenSequencePattern.Compile(line));
    }
    MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
    int sentenceIndex = 0;
    foreach (ICoreMap sentence in sentences)
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        System.Console.Out.WriteLine("Sentence #" + ++sentenceIndex);
        System.Console.Out.Write(" Tokens:");
        foreach (CoreLabel token in tokens)
        {
            System.Console.Out.Write(' ');
            System.Console.Out.Write(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        System.Console.Out.WriteLine();
        IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
        int matchIndex = 0;
        foreach (ISequenceMatchResult<ICoreMap> matched in answers)
        {
            System.Console.Out.WriteLine(" Match #" + ++matchIndex);
            System.Console.Out.WriteLine(" match: " + matched.Group(0));
            System.Console.Out.WriteLine(" who: " + matched.Group("$who"));
            System.Console.Out.WriteLine(" age: " + matched.Group("$age"));
        }
    }
}
/// <summary>
/// Configures a TokensRegexAnnotator from properties. Property keys are prefixed with
/// "name." when a name is given, or used unprefixed when name is null.
/// </summary>
public TokensRegexAnnotator(string name, Properties props)
{
    string prefix = (name == null) ? string.Empty : name + '.';
    string[] files = PropertiesUtils.GetStringArray(props, prefix + "rules");
    env = TokenSequencePattern.GetNewEnv();
    env.Bind("options", options);
    if (PropertiesUtils.GetBool(props, prefix + "caseInsensitive"))
    {
        System.Console.Error.WriteLine("using case insensitive!");
        env.SetDefaultStringMatchFlags(NodePattern.CaseInsensitive | Pattern.UnicodeCase);
        env.SetDefaultStringPatternFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
    }
    // Only build the extractor when rule files were actually supplied.
    extractor = (files.Length != 0) ? CoreMapExpressionExtractor.CreateExtractorFromFiles(env, files) : null;
    verbose = PropertiesUtils.GetBool(props, prefix + "verbose", false);
    options.setTokenOffsets = PropertiesUtils.GetBool(props, prefix + "setTokenOffsets", options.setTokenOffsets);
    options.extractWithTokens = PropertiesUtils.GetBool(props, prefix + "extractWithTokens", options.extractWithTokens);
    options.flatten = PropertiesUtils.GetBool(props, prefix + "flatten", options.flatten);
    string matchedExpressionsAnnotationKeyName = props.GetProperty(prefix + "matchedExpressionsAnnotationKey");
    if (matchedExpressionsAnnotationKeyName != null)
    {
        // Resolve the class name of the annotation key under which matches are stored.
        options.matchedExpressionsAnnotationKey = EnvLookup.LookupAnnotationKeyWithClassname(env, matchedExpressionsAnnotationKeyName);
        if (options.matchedExpressionsAnnotationKey == null)
        {
            string propName = prefix + "matchedExpressionsAnnotationKey";
            throw new Exception("Cannot determine annotation key for " + propName + '=' + matchedExpressionsAnnotationKeyName);
        }
    }
}
/// <summary>
/// Applies one learned pattern to all sentences, split across worker threads, and merges
/// the results into <paramref name="wordsandLemmaPatExtracted"/> (candidate phrases),
/// <paramref name="matchedTokensByPat"/> (matched token spans), and
/// <paramref name="alreadyLabeledWords"/>.
/// </summary>
/// <exception cref="System.NotSupportedException">If the pattern type is neither Surface nor Dep.</exception>
private void RunParallelApplyPats(IDictionary<string, DataInstance> sents, string label, E pattern, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICollection<CandidatePhrase> alreadyLabeledWords)
{
    Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
    // Labels whose already-labeled words must not be re-extracted for this label.
    IList<string> notAllowedClasses = new List<string>();
    IList<string> sentids = CollectionUtils.ToList(sents.Keys);
    if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass)
    {
        foreach (string l in constVars.GetAnswerClass().Keys)
        {
            if (!l.Equals(label))
            {
                notAllowedClasses.Add(l);
            }
        }
        notAllowedClasses.Add("OTHERSEM");
    }
    // Convert the abstract pattern into a concrete surface or dependency pattern.
    IDictionary<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
    IDictionary<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;
    if (constVars.patternType.Equals(PatternFactory.PatternType.Surface))
    {
        surfacePatternsLearnedThisIterConverted = new Dictionary<TokenSequencePattern, E>();
        string patternStr = null;
        try
        {
            patternStr = pattern.ToString(notAllowedClasses);
            TokenSequencePattern pat = ((TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr));
            surfacePatternsLearnedThisIterConverted[pat] = pattern;
        }
        catch (Exception)
        {
            // BUGFIX: the message was a string literal broken by a raw newline (invalid C#).
            log.Info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
            throw;
        }
    }
    else
    {
        if (constVars.patternType.Equals(PatternFactory.PatternType.Dep))
        {
            depPatternsLearnedThisIterConverted = new Dictionary<SemgrexPattern, E>();
            SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
            depPatternsLearnedThisIterConverted[pat] = pattern;
        }
        else
        {
            throw new NotSupportedException();
        }
    }
    // Apply the patterns and extract candidate phrases.
    int num;
    int numThreads = constVars.numThreads;
    // If the number of sentences is small, do not create so many threads.
    if (sents.Count < 50)
    {
        numThreads = 1;
    }
    if (numThreads == 1)
    {
        num = sents.Count;
    }
    else
    {
        num = sents.Count / (numThreads - 1);
    }
    // BUGFIX: size the pool by the adjusted thread count (numThreads), not the
    // configured constVars.numThreads — only numThreads tasks are ever submitted.
    IExecutorService executor = Executors.NewFixedThreadPool(numThreads);
    IList<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>> list = new List<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>>();
    for (int i = 0; i < numThreads; i++)
    {
        // Each worker handles sentence ids [i*num, min(count, (i+1)*num)).
        ICallable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> task = null;
        if (pattern.type.Equals(PatternFactory.PatternType.Surface))
        {
            task = new ApplyPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
        }
        else
        {
            task = new ApplyDepPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
        }
        IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> submit = executor.Submit(task);
        list.Add(submit);
    }
    // Now retrieve the result from each worker and merge into the output structures.
    foreach (IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> future in list)
    {
        try
        {
            Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> result = future.Get();
            Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
            wordsandLemmaPatExtracted.AddAll(result.First());
            matchedTokensByPat.AddAll(result.Second());
            Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
        }
        catch (Exception e)
        {
            executor.ShutdownNow();
            // BUGFIX: System.Exception has no Exception(Exception) constructor; wrap with
            // a message and preserve the original failure as the inner exception.
            throw new Exception("Error retrieving pattern application results", e);
        }
    }
    executor.Shutdown();
}
/// <summary>
/// Creates a non-verbose annotator with a fresh TokensRegex environment and an
/// extractor built from the given rule files.
/// </summary>
/// <param name="files">Paths of TokensRegex rule files to load.</param>
public TokensRegexAnnotator(params string[] files)
{
    env = TokenSequencePattern.GetNewEnv();
    extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(env, files);
    verbose = false;
}
// Demo: load an expression-evaluation rules file, annotate input text with CoreNLP,
// and print each matched expression together with its computed value.
// Optional args: [0] rules file, [1] input text file, [2] output file.
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string rules;
    if (args.Length > 0)
    {
        rules = args[0];
    }
    else
    {
        // Default demo rules shipped with the distribution.
        rules = "edu/stanford/nlp/ling/tokensregex/demo/rules/expr.rules.txt";
    }
    PrintWriter @out;
    if (args.Length > 2)
    {
        @out = new PrintWriter(args[2]);
    }
    else
    {
        @out = new PrintWriter(System.Console.Out);
    }
    CoreMapExpressionExtractor<MatchedExpression> extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(TokenSequencePattern.GetNewEnv(), rules);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation;
    if (args.Length > 1)
    {
        annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
    }
    else
    {
        // Built-in example expression when no input file is given.
        annotation = new Annotation("( ( five plus three plus four ) * 2 ) divided by three");
    }
    pipeline.Annotate(annotation);
    // An Annotation is a Map and you can get and use the various analyses individually.
    @out.Println();
    // The toString() method on an Annotation just prints the text of the Annotation
    // But you can see what is in it with other methods like toShorterString()
    @out.Println("The top level annotation");
    @out.Println(annotation.ToShorterString());
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int i = 0;
    foreach (ICoreMap sentence in sentences)
    {
        @out.Println("Sentence #" + ++i);
        foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
        {
            @out.Println("  Token: " + "word=" + token.Get(typeof(CoreAnnotations.TextAnnotation)) + ", pos=" + token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)) + ", ne=" + token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
        }
        IList<MatchedExpression> matchedExpressions = extractor.ExtractExpressions(sentence);
        foreach (MatchedExpression matched in matchedExpressions)
        {
            // Print out matched text and value
            @out.Println("Matched expression: " + matched.GetText() + " with value " + matched.GetValue());
            // Print out token information
            ICoreMap cm = matched.GetAnnotation();
            foreach (CoreLabel token_1 in cm.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                string word = token_1.Get(typeof(CoreAnnotations.TextAnnotation));
                string lemma = token_1.Get(typeof(CoreAnnotations.LemmaAnnotation));
                string pos = token_1.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                string ne = token_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                @out.Println("  Matched token: " + "word=" + word + ", lemma=" + lemma + ", pos=" + pos + ", ne=" + ne);
            }
        }
    }
    @out.Flush();
}
// Creates and configures the TokensRegex environment used by the SUTime rule files:
// default annotation keys and aggregators for time expressions, case-insensitive string
// patterns, and bindings for every temporal constant, unit, and value function the
// rule files reference by name.
private void InitEnv()
{
    env = TokenSequencePattern.GetNewEnv();
    // Matched results are converted into TimeExpression objects.
    env.SetDefaultResultsAnnotationExtractor(TimeExpression.TimeExpressionConverter);
    // Patterns match against numerized tokens (number words already collapsed).
    env.SetDefaultTokensAnnotationKey(typeof(CoreAnnotations.NumerizedTokensAnnotation));
    env.SetDefaultResultAnnotationKey(typeof(TimeExpression.Annotation));
    env.SetDefaultNestedResultsAnnotationKey(typeof(TimeExpression.ChildrenAnnotation));
    env.SetDefaultTokensAggregators(CoreMapAttributeAggregator.DefaultNumericTokensAggregators);
    env.Bind("nested", typeof(TimeExpression.ChildrenAnnotation));
    env.Bind("time", new TimeFormatter.TimePatternExtractRuleCreator());
    // Do case insensitive matching
    env.SetDefaultStringPatternFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
    env.Bind("options", options);
    // Named temporal constants referenced from the rule files.
    env.Bind("TIME_REF", SUTime.TimeRef);
    env.Bind("TIME_REF_UNKNOWN", SUTime.TimeRefUnknown);
    env.Bind("TIME_UNKNOWN", SUTime.TimeUnknown);
    env.Bind("TIME_NONE", SUTime.TimeNone);
    env.Bind("ERA_AD", SUTime.EraAd);
    env.Bind("ERA_BC", SUTime.EraBc);
    env.Bind("ERA_UNKNOWN", SUTime.EraUnknown);
    env.Bind("HALFDAY_AM", SUTime.HalfdayAm);
    env.Bind("HALFDAY_PM", SUTime.HalfdayPm);
    env.Bind("HALFDAY_UNKNOWN", SUTime.HalfdayUnknown);
    env.Bind("RESOLVE_TO_THIS", SUTime.ResolveToThis);
    env.Bind("RESOLVE_TO_PAST", SUTime.ResolveToPast);
    env.Bind("RESOLVE_TO_FUTURE", SUTime.ResolveToFuture);
    env.Bind("RESOLVE_TO_CLOSEST", SUTime.ResolveToClosest);
    env.Bind("numcomptype", typeof(CoreAnnotations.NumericCompositeTypeAnnotation));
    env.Bind("numcompvalue", typeof(CoreAnnotations.NumericCompositeValueAnnotation));
    env.Bind("temporal", typeof(TimeExpression.Annotation));
    // env.bind("tags", SequenceMatchRules.Tags.TagsAnnotation.class);
    // Node patterns testing the Timex type / matched-value type of an expression.
    env.Bind("::IS_TIMEX_DATE", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Date));
    env.Bind("::IS_TIMEX_DURATION", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Duration));
    env.Bind("::IS_TIMEX_TIME", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Time));
    env.Bind("::IS_TIMEX_SET", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Set));
    env.Bind("::IS_TIME_UNIT", new GenericTimeExpressionPatterns.MatchedExpressionValueTypeMatchNodePattern("TIMEUNIT"));
    env.Bind("::MONTH", new GenericTimeExpressionPatterns.MatchedExpressionValueTypeMatchNodePattern("MONTH_OF_YEAR"));
    env.Bind("::DAYOFWEEK", new GenericTimeExpressionPatterns.MatchedExpressionValueTypeMatchNodePattern("DAY_OF_WEEK"));
    // BINDINGS for parsing from file!!!!!!!
    // Bind each temporal operator, time unit, and standard temporal type by name.
    foreach (SUTime.TemporalOp t in SUTime.TemporalOp.Values())
    {
        env.Bind(t.ToString(), new Expressions.PrimitiveValue<SUTime.TemporalOp>("TemporalOp", t));
    }
    foreach (SUTime.TimeUnit t_1 in SUTime.TimeUnit.Values())
    {
        if (!t_1.Equals(SUTime.TimeUnit.Unknown))
        {
            //env.bind(t.name(), new SequenceMatchRules.PrimitiveValue<SUTime.Temporal>("DURATION", t.getDuration(), "TIMEUNIT"));
            env.Bind(t_1.ToString(), new Expressions.PrimitiveValue<SUTime.Temporal>("TIMEUNIT", t_1.GetDuration()));
        }
    }
    foreach (SUTime.StandardTemporalType t_2 in SUTime.StandardTemporalType.Values())
    {
        env.Bind(t_2.ToString(), new Expressions.PrimitiveValue<SUTime.StandardTemporalType>("TemporalType", t_2));
    }
    // Value functions; the _NamedValueFunction_* anonymous classes are generated
    // elsewhere in this file (numbers refer to original source lines).
    env.Bind("Duration", new Expressions.PrimitiveValue<IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_124("Duration")));
    // New so we get different time ids
    // TODO: Check args
    // TODO: Handle Strings...
    // TODO: This should already be in durations....
    //String durationUnitString = (durationUnitTokens != null)? durationUnitTokens.get(0).get(CoreAnnotations.TextAnnotation.class):null;
    //SUTime.Duration durationUnit = getDuration(durationUnitString);
    // TODO: Handle inexactness
    // Create duration range...
    // Add begin and end times
    env.Bind("DayOfWeek", new Expressions.PrimitiveValue<IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_212("DayOfWeek")));
    env.Bind("MonthOfYear", new Expressions.PrimitiveValue<IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_235("MonthOfYear")));
    // First argument is the temporal acting as the base of the periodic set
    // Second argument is the quantifier (string)
    // Third argument is the multiple (how much to scale the natural period)
    /*"P1X"*/
    env.Bind("MakePeriodicTemporalSet", new Expressions.PrimitiveValue<IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_258("MakePeriodicTemporalSet")));
    env.Bind("TemporalCompose", new Expressions.PrimitiveValue<IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_328("TemporalCompose")));
}
/// <summary>
/// Configures sentence splitting from properties. Three modes: newline-only splitting
/// (ssplit.newlineIsSentenceBreak-style properties), one-sentence mode
/// (ssplit.isOneSentence), or regex/boundary-based splitting (the ssplit.* options).
/// </summary>
public WordsToSentencesAnnotator(Properties properties)
{
    bool nlSplitting = bool.ValueOf(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
    if (nlSplitting)
    {
        bool whitespaceTokenization = bool.ValueOf(properties.GetProperty("tokenize.whitespace", "false"));
        if (whitespaceTokenization)
        {
            if (Runtime.LineSeparator().Equals("\n"))
            {
                // this constructor will keep empty lines as empty sentences
                WordToSentenceProcessor<CoreLabel> wts1 = new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { "\n" }));
                this.countLineNumbers = true;
                this.wts = wts1;
            }
            else
            {
                // throw "\n" in just in case files use that instead of
                // the system separator
                // this constructor will keep empty lines as empty sentences
                WordToSentenceProcessor<CoreLabel> wts1 = new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { Runtime.LineSeparator(), "\n" }));
                this.countLineNumbers = true;
                this.wts = wts1;
            }
        }
        else
        {
            // this constructor will keep empty lines as empty sentences
            WordToSentenceProcessor<CoreLabel> wts1 = new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { PTBTokenizer.GetNewlineToken() }));
            this.countLineNumbers = true;
            this.wts = wts1;
        }
    }
    else
    {
        string isOneSentence = properties.GetProperty("ssplit.isOneSentence");
        // BUGFIX: bool.Parse throws on null (and on unparseable text), but this option
        // is optional and must be treated as false when unset — the semantics of the
        // original Java Boolean.parseBoolean. TryParse returns false in both cases.
        if (bool.TryParse(isOneSentence, out bool oneSentence) && oneSentence)
        {
            // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
            WordToSentenceProcessor<CoreLabel> wts1 = new WordToSentenceProcessor<CoreLabel>(true);
            this.countLineNumbers = false;
            this.wts = wts1;
        }
        else
        {
            // multi token sentence boundaries
            string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");
            // Discard these tokens without marking them as sentence boundaries
            string tokenPatternsToDiscardProp = properties.GetProperty("ssplit.tokenPatternsToDiscard");
            ICollection<string> tokenRegexesToDiscard = null;
            if (tokenPatternsToDiscardProp != null)
            {
                string[] toks = tokenPatternsToDiscardProp.Split(",");
                tokenRegexesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
            }
            // regular boundaries
            string boundaryTokenRegex = properties.GetProperty("ssplit.boundaryTokenRegex");
            string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");
            // newline boundaries which are discarded.
            ICollection<string> boundariesToDiscard = null;
            string bounds = properties.GetProperty("ssplit.boundariesToDiscard");
            if (bounds != null)
            {
                string[] toks = bounds.Split(",");
                boundariesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
            }
            // HTML boundaries which are discarded
            ICollection<string> htmlElementsToDiscard = null;
            bounds = properties.GetProperty("ssplit.htmlBoundariesToDiscard");
            if (bounds != null)
            {
                string[] elements = bounds.Split(",");
                htmlElementsToDiscard = Generics.NewHashSet(Arrays.AsList(elements));
            }
            string nlsb = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
            this.countLineNumbers = false;
            this.wts = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard, htmlElementsToDiscard, WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb), (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard);
        }
    }
    Verbose = bool.ValueOf(properties.GetProperty("ssplit.verbose", "false"));
}
/// <summary>
/// Convenience constructor: builds the underlying WordToSentenceProcessor from the given
/// boundary configuration (no boundary-followers regex; line numbers not counted).
/// The multi-token boundary regex, when non-null, is compiled as a TokenSequencePattern.
/// </summary>
public WordsToSentencesAnnotator(bool verbose, string boundaryTokenRegex, ICollection<string> boundaryToDiscard, ICollection<string> htmlElementsToDiscard, string newlineIsSentenceBreak, string boundaryMultiTokenRegex, ICollection<string> tokenRegexesToDiscard)
    : this(verbose, false, new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, null, boundaryToDiscard, htmlElementsToDiscard, WordToSentenceProcessor.StringToNewlineIsSentenceBreak(newlineIsSentenceBreak), (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard))
{
}