/// <summary>
/// Tags PERSON entity mentions as MALE or FEMALE based on the first token of the mention.
/// </summary>
/// <remarks>
/// Walks every sentence of the annotation and every entity mention in each sentence.
/// For a mention whose entity type is "PERSON", the lower-cased word of its first token is
/// looked up in <c>maleNames</c> and then in <c>femaleNames</c>; the first list that contains
/// it decides the value passed to <c>AnnotateEntityMention</c>. Mentions that match neither
/// list are left untouched.
/// </remarks>
/// <param name="annotation">Document annotation providing the sentences to scan.</param>
public virtual void Annotate(Annotation annotation)
{
    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        foreach (ICoreMap entityMention in sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)))
        {
            // Constant-first comparison: a mention without an entity type is skipped
            // instead of raising a null reference error.
            if (!"PERSON".Equals(entityMention.Get(typeof(CoreAnnotations.EntityTypeAnnotation))))
            {
                continue;
            }
            // A mention without tokens has no first name to look up.
            IList <CoreLabel> mentionTokens = entityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
            if (mentionTokens == null || mentionTokens.Count == 0)
            {
                continue;
            }
            // Lower-case once; both name lists are queried with the same key.
            string loweredFirstName = mentionTokens[0].Word().ToLower();
            if (maleNames.Contains(loweredFirstName))
            {
                AnnotateEntityMention(entityMention, "MALE");
            }
            else if (femaleNames.Contains(loweredFirstName))
            {
                AnnotateEntityMention(entityMention, "FEMALE");
            }
        }
    }
}
/// <summary>
/// Relabels unlabeled all-caps tokens as ORGANIZATION when they are an acronym of an
/// organization mention found elsewhere in the same document.
/// </summary>
/// <remarks>
/// Documents with more than 100 organization mentions are skipped entirely.
/// </remarks>
/// <param name="ann">Document annotation whose sentences carry mentions and tokens.</param>
private void AddAcronyms(Annotation ann)
{
    // Find all the organizations in a document
    IList <ICoreMap> allMentionsSoFar = new List <ICoreMap>();
    foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        Sharpen.Collections.AddAll(allMentionsSoFar, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }
    IList <IList <CoreLabel> > organizations = new List <IList <CoreLabel> >();
    foreach (ICoreMap mention in allMentionsSoFar)
    {
        if ("ORGANIZATION".Equals(mention.Get(nerCoreAnnotationClass)))
        {
            organizations.Add(mention.Get(typeof(CoreAnnotations.TokensAnnotation)));
        }
    }
    // Skip very long documents
    if (organizations.Count > 100)
    {
        return;
    }
    // Iterate over tokens...
    foreach (ICoreMap sentence_1 in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // NOTE(review): the chunks collected here are never written back to the sentence
        // in this method; only the token-level NER tag change is observable. Confirm
        // whether the chunks were meant to be appended to the sentence's mentions.
        IList <ICoreMap> sentenceMentions = new List <ICoreMap>();
        IList <CoreLabel> tokens = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
        int totalTokensOffset = sentence_1.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        for (int i = 0; i < tokens.Count; ++i)
        {
            // ... that look like they might be an acronym and are not already a mention
            CoreLabel token = tokens[i];
            if (!"O".Equals(token.Ner()))
            {
                continue;
            }
            string word = token.Word();
            if (!word.ToUpper().Equals(word) || word.Length < 3)
            {
                continue;
            }
            foreach (IList <CoreLabel> org in organizations)
            {
                // ... and actually are an acronym
                if (AcronymMatcher.IsAcronym(word, org))
                {
                    // ... and add them.
                    token.SetNER("ORGANIZATION");
                    ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                    chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                    sentenceMentions.Add(chunk);
                    // One match is enough: the token is now labeled, and further matches
                    // would only create duplicate chunks for the same token.
                    break;
                }
            }
        }
    }
}
/// <summary>
/// Decides whether a token satisfies the NER and POS-tag restrictions configured for a label.
/// </summary>
/// <remarks>
/// When <c>PatternFactory.useTargetNERRestriction</c> is set, the token's named-entity tag must
/// match one of the entries in <c>constVars.allowedNERsforLabels[label]</c>. If that passes and
/// tag initials are configured for the label, the token's tag must start with one of them.
/// At debug level 4 and above the outcome is written to standard output.
/// </remarks>
/// <param name="coreLabel">Token to test.</param>
/// <param name="label">Label whose restrictions are looked up.</param>
/// <returns><c>true</c> if the token passes every applicable restriction.</returns>
private bool MatchedRestriction(CoreLabel coreLabel, string label)
{
    bool accepted;
    if (!PatternFactory.useTargetNERRestriction)
    {
        // No NER restriction configured: every token passes this stage.
        accepted = true;
    }
    else
    {
        accepted = false;
        foreach (string nerPattern in constVars.allowedNERsforLabels[label])
        {
            if (coreLabel.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Matches(nerPattern))
            {
                accepted = true;
                break;
            }
        }
    }

    if (accepted)
    {
        string tag = coreLabel.Tag();
        if (constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.Contains(label))
        {
            // Each iteration overwrites the verdict; an empty list leaves it untouched.
            foreach (string allowedInitial in constVars.allowedTagsInitials[label])
            {
                accepted = tag.StartsWith(allowedInitial);
                if (accepted)
                {
                    break;
                }
            }
        }
    }

    if (constVars.debug >= 4)
    {
        string verdict = accepted ? " matched restriction " : " did not matched restrict ";
        System.Console.Out.WriteLine(coreLabel.Word() + verdict + (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels[label] : string.Empty) + "and" + PatternFactory.useTargetNERRestriction + " and " + (constVars.allowedTagsInitials != null ? constVars.allowedTagsInitials[label] : string.Empty));
    }
    return accepted;
}
/// <summary>
/// Tells whether a token counts as a stop word.
/// </summary>
/// <param name="l">Token to inspect.</param>
/// <param name="commonEngWords">Collection checked against the token's lemma and word.</param>
/// <param name="ignoreWordRegex">Optional pattern tested against the lemma; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> if the lemma or the word is in <paramref name="commonEngWords"/>, or if the
/// lemma fully matches <paramref name="ignoreWordRegex"/>; otherwise <c>false</c>.
/// </returns>
private static bool ContainsStopWord(CoreLabel l, ICollection <string> commonEngWords, Pattern ignoreWordRegex)
{
    if (commonEngWords.Contains(l.Lemma()))
    {
        return true;
    }
    if (commonEngWords.Contains(l.Word()))
    {
        return true;
    }
    // The regex is optional; without it only the word-list checks above apply.
    return ignoreWordRegex != null && ignoreWordRegex.Matcher(l.Lemma()).Matches();
}
/// <summary>
/// Returns the next segmented word, pulling and segmenting a new token when needed.
/// </summary>
/// <returns>
/// The next word, or <c>null</c> when the token source is exhausted or yields a token
/// whose word is <c>null</c>.
/// </returns>
protected internal override IHasWord GetNext()
{
    for (;;)
    {
        // Drain the current segment list first.
        if (wordIter != null && wordIter.MoveNext())
        {
            return wordIter.Current;
        }
        if (!tok.MoveNext())
        {
            return null;
        }
        CoreLabel token = tok.Current;
        string text = token.Word();
        if (text == null)
        {
            return null;
        }
        IList <IHasWord> segments;
        if (text.Equals(WhitespaceLexer.Newline))
        {
            // Newline tokens are passed through unsegmented so that callers which treat
            // newlines as significant still see them.
            segments = Java.Util.Collections.SingletonList <IHasWord>(token);
        }
        else
        {
            segments = wordSegmenter.Segment(text);
        }
        wordIter = segments.GetEnumerator();
    }
}
/// <summary>
/// Splits a contraction marked by the lexer into two labels.
/// del => de + el, al => a + el, con[ms]igo => con + [ms]í, contigo => con + ti.
/// </summary>
/// <remarks>
/// The second part is queued on <c>compoundBuffer</c>; the first part is returned.
/// </remarks>
/// <param name="cl">Label holding the contraction; its parent annotation is removed.</param>
/// <returns>A copy of <paramref name="cl"/> carrying the first part of the contraction.</returns>
/// <exception cref="ArgumentException">The word is not one of the handled contractions.</exception>
private CoreLabel ProcessContraction(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string surface = cl.Word();
    string lowered = surface.ToLower();

    string first;
    string second;
    int secondOffset;
    int secondLength;
    if (lowered == "del" || lowered == "al")
    {
        int stemLength = lowered.Length - 1;
        first = Sharpen.Runtime.Substring(surface, 0, stemLength);
        // Mirror the case of the contraction's final letter in the expanded article.
        second = char.IsLowerCase(surface[stemLength]) ? "el" : "EL";
        secondOffset = 1;
        secondLength = stemLength;
    }
    else if (lowered == "conmigo" || lowered == "consigo")
    {
        first = Sharpen.Runtime.Substring(surface, 0, 3);
        second = surface[3] + "í";
        secondOffset = 3;
        secondLength = 4;
    }
    else if (lowered == "contigo")
    {
        first = Sharpen.Runtime.Substring(surface, 0, 3);
        second = Sharpen.Runtime.Substring(surface, 3, 5);
        secondOffset = 3;
        secondLength = 4;
    }
    else
    {
        throw new ArgumentException("Invalid contraction provided to processContraction");
    }

    int secondStart = cl.BeginPosition() + secondOffset;
    int secondEnd = secondStart + secondLength;
    compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
    return CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart);
}
/// <summary>
/// Builds the word indicator for a pair of tokens by joining their lower-cased words
/// with an underscore and delegating to the string overload.
/// </summary>
/// <param name="cl1">First token; "NONE" is used when it is <c>null</c>.</param>
/// <param name="cl2">Second token; "NONE" is used when it is <c>null</c>.</param>
/// <param name="Pos">Value forwarded unchanged to the string overload.</param>
/// <returns>Whatever the string overload returns for the joined key.</returns>
private string WordIndicator(CoreLabel cl1, CoreLabel cl2, string Pos)
{
    string left = "NONE";
    if (cl1 != null)
    {
        left = cl1.Word().ToLower();
    }
    string right = "NONE";
    if (cl2 != null)
    {
        right = cl2.Word().ToLower();
    }
    string pairKey = left + "_" + right;
    return WordIndicator(pairKey, Pos);
}
/// <summary>Get the text value of this entity.</summary>
/// <remarks>
/// Joins the words of the tokens covered by <c>headTokenSpan</c> with single spaces.
/// The headTokenSpan MUST be set before calling this method!
/// Character offsets are not guaranteed to be present on the tokens, so the original
/// spacing is not reconstructed.
/// </remarks>
/// <returns>The space-separated words of the head token span.</returns>
public override string GetValue()
{
    IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    StringBuilder text = new StringBuilder();
    // Empty before the first word, a single space afterwards.
    string separator = string.Empty;
    for (int pos = headTokenSpan.Start(); pos < headTokenSpan.End(); pos++)
    {
        text.Append(separator);
        text.Append(tokens[pos].Word());
        separator = " ";
    }
    return text.ToString();
}
/// <summary>
/// Builds the clique-C feature strings for the element at <paramref name="loc"/>.
/// </summary>
/// <remarks>
/// Features cover a five-element character window (current, two following, two preceding),
/// a length flag, per-character Unicode block/type features, punctuation and digit flags,
/// and capped position/length features derived from the token's word and index.
/// </remarks>
/// <param name="cInfo">Padded sequence; neighbours at loc-2 .. loc+2 are read.</param>
/// <param name="loc">Position of the current element.</param>
/// <returns>The feature strings, in the order they were generated.</returns>
protected internal virtual ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc)
{
    ICollection <string> features = new List <string>();
    CoreLabel c = cInfo[loc];
    CoreLabel n = cInfo[loc + 1];
    CoreLabel n2 = cInfo[loc + 2];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    string charc = c.Get(typeof(CoreAnnotations.CharAnnotation));
    string charn = n.Get(typeof(CoreAnnotations.CharAnnotation));
    string charn2 = n2.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp = p.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

    // Default feature set...a 5 character window
    // plus a few other language-independent features
    features.Add(charc + "-c");
    features.Add(charn + "-n1");
    features.Add(charn2 + "-n2");
    features.Add(charp + "-p");
    features.Add(charp2 + "-p2");

    // Length feature
    if (charc.Length > 1)
    {
        features.Add("length");
    }

    // Character-level class features
    bool seenPunc = false;
    bool seenDigit = false;
    // Bounded by the length of charc: the loop indexes charc[i], and the block
    // declares no other bound.
    for (int i = 0; i < charc.Length; ++i)
    {
        char charcC = charc[i];
        seenPunc = seenPunc || Characters.IsPunctuation(charcC);
        seenDigit = seenDigit || char.IsDigit(charcC);
        string cuBlock = Characters.UnicodeBlockStringOf(charcC);
        features.Add(cuBlock + "-uBlock");
        string cuType = char.GetType(charcC).ToString();
        features.Add(cuType + "-uType");
    }
    if (seenPunc)
    {
        features.Add("haspunc");
    }
    if (seenDigit)
    {
        features.Add("hasdigit");
    }

    // Token-level features
    string word = c.Word();
    int index = c.Index();
    features.Add(Math.Min(MaxBefore, index) + "-before");
    features.Add(Math.Min(MaxAfter, word.Length - charc.Length - index) + "-after");
    features.Add(Math.Min(MaxLength, word.Length) + "-length");

    // Indicator transition feature
    features.Add("cliqueC");
    return features;
}
/// <summary>
/// Builds the word indicator for a single token by delegating to the string overload
/// with the token's lower-cased word.
/// </summary>
/// <param name="cl">Token to describe; may be <c>null</c>.</param>
/// <param name="Pos">Value forwarded unchanged to the string overload.</param>
/// <returns>"NONE" for a <c>null</c> token, otherwise the string overload's result.</returns>
private string WordIndicator(CoreLabel cl, string Pos)
{
    return cl == null
        ? "NONE"
        : WordIndicator(cl.Word().ToLower(), Pos);
}
/// <summary>
/// Builds the feature datum for the token at position <paramref name="i"/>.
/// </summary>
/// <remarks>
/// The datum's label is <c>answerLabel</c> when the token's answer annotation equals it and
/// "O" otherwise. One "Cluster-N" feature is set per matched phrase, where N is the phrase's
/// id in <c>clusterIds</c>, or -1 when the phrase has no id. Context-window features are
/// generated for up to <c>window</c> neighbours; the window is currently 0, so none are added.
/// </remarks>
/// <param name="sent">Tokens of the sentence.</param>
/// <param name="i">Index of the token to featurize.</param>
/// <returns>The features and label for the token.</returns>
private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
{
    ICounter <string> feat = new ClassicCounter <string>();
    CoreLabel l = sent[i];
    string label;
    if (l.Get(answerClass).ToString().Equals(answerLabel))
    {
        label = answerLabel;
    }
    else
    {
        label = "O";
    }

    // Fall back to the token's own word when no phrase was matched on it.
    CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
    if (matchedPhrases == null)
    {
        matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
        matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
    }
    foreach (CandidatePhrase w in matchedPhrases.AllValues())
    {
        // An int can never be null, so a missing phrase must be detected through the
        // lookup itself; unknown phrases get cluster id -1.
        int num;
        if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
        {
            num = -1;
        }
        feat.SetCount("Cluster-" + num, 1.0);
    }

    // With a window of 0 both context loops below run zero times.
    int window = 0;
    for (int j = Math.Max(0, i - window); j < i; j++)
    {
        CoreLabel lj = sent[j];
        feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
        feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
    }
    for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
    {
        CoreLabel lj = sent[j_1];
        feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
        feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
    }
    return new RVFDatum <string, string>(feat, label);
}
/// <summary>
/// Extends the base clique-CpC features with "wrapper" features for longer words.
/// </summary>
/// <remarks>
/// For words longer than three characters, the first two and last two characters are
/// combined into a feature when the element's index is 2 ("-begin-wrap") or equals the
/// word length minus one ("-end-wrap"). This helps detect ma+_+sh in dialect, as well as
/// avoiding segmenting possessive pronouns if the word starts with al-.
/// </remarks>
/// <param name="cInfo">Padded sequence being featurized.</param>
/// <param name="loc">Position of the current element.</param>
/// <returns>The base features plus any wrapper features.</returns>
protected internal override ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
{
    ICollection <string> features = base.FeaturesCpC(cInfo, loc);
    CoreLabel current = cInfo[loc];
    string word = current.Word();
    if (word.Length <= 3)
    {
        return features;
    }

    string prefix = Sharpen.Runtime.Substring(word, 0, 2);
    string suffix = Sharpen.Runtime.Substring(word, word.Length - 2);
    string wrapper = prefix + "_" + suffix;
    int position = current.Index();
    if (position == 2)
    {
        features.Add(wrapper + "-begin-wrap");
    }
    if (position == word.Length - 1)
    {
        features.Add(wrapper + "-end-wrap");
    }
    return features;
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Dashes in the word are padded with spaces and the result is split; every piece becomes
/// a copy of <paramref name="cl"/> with its word, value and original text replaced by the
/// piece. All pieces are queued on <c>compoundBuffer</c>.
/// </remarks>
/// <param name="cl">Label holding the compound; its parent annotation is removed.</param>
/// <returns>The result of removing the entry at position 0 from <c>compoundBuffer</c>.</returns>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string padded = cl.Word().ReplaceAll("-", " - ");
    string[] pieces = padded.Split("\\s+");
    for (int k = 0; k < pieces.Length; k++)
    {
        string piece = pieces[k];
        CoreLabel pieceLabel = new CoreLabel(cl);
        pieceLabel.SetWord(piece);
        pieceLabel.SetValue(piece);
        pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(pieceLabel);
    }
    return compoundBuffer.Remove(0);
}
/// <summary>Splits a contraction marked by the lexer.</summary>
/// <remarks>
/// Handled contractions (matched case-insensitively):
/// au => à + le
/// aux => à + les
/// du => de + le
/// Any other word, including "des", is rejected with an exception.
/// The second part is queued on <c>compoundBuffer</c>; the first part is returned.
/// </remarks>
/// <param name="cl">Label holding the contraction; its parent annotation is removed.</param>
/// <returns>A copy of <paramref name="cl"/> carrying the first part of the contraction.</returns>
/// <exception cref="ArgumentException">The word is not one of the handled contractions.</exception>
private CoreLabel ProcessContraction(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string surface = cl.Word();
    string lowered = surface.ToLower();

    string first;
    string second;
    int secondOffset;
    int secondLength;
    if (lowered == "au")
    {
        first = "à";
        second = "le";
        secondOffset = 1;
        secondLength = 1;
    }
    else if (lowered == "aux")
    {
        first = "à";
        second = "les";
        secondOffset = 1;
        secondLength = 2;
    }
    else if (lowered == "du")
    {
        first = "de";
        second = "le";
        secondOffset = 1;
        secondLength = 1;
    }
    else
    {
        throw new ArgumentException("Invalid contraction provided to processContraction");
    }

    int secondStart = cl.BeginPosition() + secondOffset;
    int secondEnd = secondStart + secondLength;
    compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
    return CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart);
}
/// <summary>
/// Collects the lower-cased words of a mention whose part-of-speech tag is
/// NN, NNS, NNP or NNPS.
/// </summary>
/// <param name="m">Mention whose tokens from startIndex (inclusive) to endIndex (exclusive) are scanned.</param>
/// <returns>The matching words in sentence order; duplicates are kept.</returns>
private static IList <string> GetContentWords(Mention m)
{
    IList <string> contentWords = new List <string>();
    int idx = m.startIndex;
    while (idx < m.endIndex)
    {
        CoreLabel token = m.sentenceWords[idx];
        string tag = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
        bool isNoun = tag.Equals("NN") || tag.Equals("NNS") || tag.Equals("NNP") || tag.Equals("NNPS");
        if (isNoun)
        {
            contentWords.Add(token.Word().ToLower());
        }
        idx++;
    }
    return contentWords;
}
/// <summary>
/// Finds the document token positions inside a span whose word is in <c>animacySet</c>.
/// </summary>
/// <param name="span">Token range to scan; both ends are inclusive.</param>
/// <returns>
/// The matching token indices in ascending order. Positions at or beyond the end of the
/// document's token list are not visited.
/// </returns>
public virtual IList <int> ScanForAnimates(Pair <int, int> span)
{
    IList <CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList <int> hits = new List <int>();
    int pos = span.first;
    while (pos <= span.second && pos < tokens.Count)
    {
        if (animacySet.Contains(tokens[pos].Word()))
        {
            hits.Add(pos);
        }
        pos++;
    }
    return hits;
}
/// <summary>
/// Collects the distinct lower-cased words of a mention whose part-of-speech tag is
/// contained in <c>Propers</c>.
/// </summary>
/// <param name="m">Mention whose tokens from startIndex (inclusive) to endIndex (exclusive) are scanned.</param>
/// <returns>The set of matching lower-cased words.</returns>
private static ICollection <string> GetPropers(Mention m)
{
    ICollection <string> found = new HashSet <string>();
    int idx = m.startIndex;
    while (idx < m.endIndex)
    {
        CoreLabel token = m.sentenceWords[idx];
        string tag = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
        string lowered = token.Word().ToLower();
        if (Propers.Contains(tag))
        {
            found.Add(lowered);
        }
        idx++;
    }
    return found;
}
/// <summary>
/// Returns the words of the tokens covered by <c>extentTokenSpan</c>, joined by single spaces.
/// </summary>
/// <returns>The space-separated extent text; empty when the span covers no tokens.</returns>
public virtual string GetExtentString()
{
    IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    StringBuilder extent = new StringBuilder();
    bool needsSpace = false;
    for (int pos = extentTokenSpan.Start(); pos < extentTokenSpan.End(); pos++)
    {
        if (needsSpace)
        {
            extent.Append(" ");
        }
        extent.Append(tokens[pos].Word());
        needsSpace = true;
    }
    return extent.ToString();
}
/// <summary>
/// Computes the true-cased text of a token from its predicted case class and stores it
/// on the token.
/// </summary>
/// <remarks>
/// "UPPER" and "LOWER" convert the whole word. "INIT_UPPER" title-cases the first character
/// and lower-cases the rest. "O" (mixed case) looks the lower-cased word up in
/// <c>mixedCaseMap</c> and keeps the original text when there is no entry. Any other value
/// leaves the text unchanged. When <c>overwriteText</c> is set, the token's text and value
/// annotations are replaced as well.
/// </remarks>
/// <param name="l">Token to update.</param>
private void SetTrueCaseText(CoreLabel l)
{
    string trueCase = l.GetString <CoreAnnotations.TrueCaseAnnotation>();
    string text = l.Word();
    string trueCaseText = text;
    switch (trueCase)
    {
        case "UPPER":
        {
            trueCaseText = text.ToUpper();
            break;
        }

        case "LOWER":
        {
            trueCaseText = text.ToLower();
            break;
        }

        case "INIT_UPPER":
        {
            // An empty token has no first character to capitalize; keep it as is
            // instead of failing on text[0].
            if (text.Length > 0)
            {
                trueCaseText = char.ToTitleCase(text[0]) + Sharpen.Runtime.Substring(text, 1).ToLower();
            }
            break;
        }

        case "O":
        {
            // The model predicted mixed case, so lookup the map:
            string lower = text.ToLower();
            if (mixedCaseMap.Contains(lower))
            {
                trueCaseText = mixedCaseMap[lower];
            }
            // Otherwise the original text is kept.
            break;
        }
    }
    l.Set(typeof(CoreAnnotations.TrueCaseTextAnnotation), trueCaseText);
    if (overwriteText)
    {
        l.Set(typeof(CoreAnnotations.TextAnnotation), trueCaseText);
        l.Set(typeof(CoreAnnotations.ValueAnnotation), trueCaseText);
    }
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Dashes in the word are padded with spaces via <c>pDash</c> and the result is split with
/// <c>pSpace</c>. Every piece becomes a copy of <paramref name="cl"/> with its word, value
/// and original text replaced by the piece, and with begin/end positions advanced by the
/// combined length of the preceding pieces. All pieces are queued on <c>compoundBuffer</c>.
/// </remarks>
/// <param name="cl">Label holding the compound; its parent annotation is removed.</param>
/// <returns>The result of removing the entry at position 0 from <c>compoundBuffer</c>.</returns>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string padded = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
    string[] pieces = pSpace.Split(padded);
    // Number of characters of the original word consumed by earlier pieces.
    int consumed = 0;
    for (int k = 0; k < pieces.Length; k++)
    {
        string piece = pieces[k];
        CoreLabel pieceLabel = new CoreLabel(cl);
        pieceLabel.SetWord(piece);
        pieceLabel.SetValue(piece);
        int pieceStart = cl.BeginPosition() + consumed;
        pieceLabel.SetBeginPosition(pieceStart);
        pieceLabel.SetEndPosition(pieceStart + piece.Length);
        pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(pieceLabel);
        consumed += piece.Length;
    }
    return compoundBuffer.Remove(0);
}
/// <summary>
/// Checks that iterating an <c>ObjectBankWrapper</c> over a small plain-text document yields
/// the expected tokens and "chris2" word shapes, in order and in the expected number.
/// </summary>
public virtual void TestUsingIterator()
{
    string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";
    string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
    string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
    NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");

    Properties props = PropertiesUtils.AsProperties("wordShape", "chris2");
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();
    readerAndWriter.Init(flags);
    ReaderIteratorFactory rif = new ReaderIteratorFactory(new StringReader(s));
    ObjectBank <IList <CoreLabel> > di = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
    ICollection <string> knownLCWords = new HashSet <string>();
    ObjectBankWrapper <CoreLabel> obw = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

    try
    {
        int outIdx = 0;
        foreach (IList <CoreLabel> sent in obw)
        {
            foreach (CoreLabel cl in sent)
            {
                string tok = cl.Word();
                string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                outIdx++;
            }
        }
        if (outIdx < output.Length)
        {
            NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
        }
    }
    catch (NUnit.Framework.AssertionException)
    {
        // Assertion failures are exceptions in NUnit; let them surface with their own
        // message instead of being re-reported as "too many things" below.
        throw;
    }
    catch (Exception e)
    {
        // Typically an out-of-range index into the expected arrays.
        NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
    }
}
//Note: this doesn't necessarily find all possible candidates, but is kind of a greedy version.
// E.g. "Elizabeth and Jane" will return only "Elizabeth and Jane", but not "Elizabeth", and "Jane" as well.
/// <summary>
/// Scans a run of document tokens for known names by walking the name trie rooted at
/// <c>rootNameNode</c>.
/// </summary>
/// <remarks>
/// The trie pointer advances while consecutive tokens match child nodes. When a token does
/// not match and the pointer is not at the root, the name ending at the previous token is
/// recorded (if the node has a full name) and the pointer is reset to the root; the
/// non-matching token itself is not retried from the root.
/// </remarks>
/// <param name="textRun">Token range to scan; both ends are inclusive.</param>
/// <returns>
/// The names found and, in the same order, their token spans as (start, end) pairs with
/// an inclusive end.
/// </returns>
public virtual Pair <List <string>, List <Pair <int, int> > > ScanForNamesNew(Pair <int, int> textRun)
{
    List <string> potentialNames = new List <string>();
    List <Pair <int, int> > nameIndices = new List <Pair <int, int> >();
    IList <CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    Sieve.TokenNode pointer = rootNameNode;
    for (int index = textRun.first; index <= textRun.second && index < tokens.Count; index++)
    {
        CoreLabel token = tokens[index];
        string tokenText = token.Word();
        if (pointer.childNodes.Keys.Contains(tokenText))
        {
            pointer = pointer.childNodes[tokenText];
        }
        else
        {
            if (!pointer.token.Equals("$ROOT"))
            {
                if (pointer.fullName != null)
                {
                    potentialNames.Add(pointer.fullName);
                    nameIndices.Add(new Pair <int, int>(index - 1 - pointer.level, index - 1));
                }
                pointer = rootNameNode;
            }
        }
    }
    // The loop stops at whichever comes first: the end of the run or the end of the token
    // list. Clamp the end-case position accordingly so a name still open at that point is
    // reported with indices inside the token list.
    int index_1 = System.Math.Min(textRun.second + 1, tokens.Count);
    if (!pointer.token.Equals("$ROOT"))
    {
        //catch the end case
        if (pointer.fullName != null)
        {
            potentialNames.Add(pointer.fullName);
            nameIndices.Add(new Pair <int, int>(index_1 - 1 - pointer.level, index_1 - 1));
        }
        pointer = rootNameNode;
    }
    return new Pair <List <string>, List <Pair <int, int> > >(potentialNames, nameIndices);
}
/// <summary>
/// Writes sentences and their dependency trees to a file, one tab-separated line per token
/// and a blank line after each sentence.
/// </summary>
/// <remarks>
/// Token lines are numbered from 1. Each line carries the token's word, its tag (written
/// twice), and the head and label the tree reports for that token number.
/// </remarks>
/// <param name="outFile">Path of the file to write.</param>
/// <param name="sentences">Sentences to write; each must carry a tokens annotation.</param>
/// <param name="trees">Dependency trees, index-aligned with <paramref name="sentences"/>.</param>
/// <exception cref="RuntimeIOException">Wraps any exception raised while writing.</exception>
public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
{
    try
    {
        PrintWriter output = IOUtils.GetPrintWriter(outFile);
        try
        {
            for (int i = 0; i < sentences.Count; i++)
            {
                ICoreMap sentence = sentences[i];
                DependencyTree tree = trees[i];
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                // j is the 1-based token number used in the output and by the tree.
                for (int j = 1; j <= tokens.Count; ++j)
                {
                    CoreLabel token = tokens[j - 1];
                    output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                }
                output.Println();
            }
        }
        finally
        {
            // Release the file even when writing fails part-way through.
            output.Close();
        }
    }
    catch (Exception e)
    {
        throw new RuntimeIOException(e);
    }
}
// && !text.contains("+") &&
// !text.contains("*");// && !
// text.contains("$") && !text.contains("\"");
/// <summary>
/// Builds, for every token position of a sentence, the set of surface patterns around it.
/// </summary>
/// <param name="sent">Sentence whose tokens are processed.</param>
/// <param name="stopWords">Stop words; forwarded to the stop-word check and to <c>GetContext</c>.</param>
/// <returns>
/// A map with one entry per token index. Tokens rejected by <c>PatternFactory.DoNotUse</c>
/// map to an empty set; all others map to the result of <c>GetContext</c>.
/// </returns>
public static IDictionary <int, ISet> GetPatternsAroundTokens(DataInstance sent, ICollection <CandidatePhrase> stopWords)
{
    IDictionary <int, ISet> patternsByIndex = new Dictionary <int, ISet>();
    IList <CoreLabel> tokens = sent.GetTokens();
    int position = 0;
    while (position < tokens.Count)
    {
        if (PatternFactory.DoNotUse(tokens[position].Word(), stopWords))
        {
            // do not create patterns around stop words!
            patternsByIndex[position] = new HashSet <SurfacePattern>();
        }
        else
        {
            ICollection <SurfacePattern> around = GetContext(sent.GetTokens(), position, stopWords);
            patternsByIndex[position] = around;
        }
        position++;
    }
    return patternsByIndex;
}
/// <summary>
/// Applies every pattern in <c>patterns</c> to the tokens of every sentence listed in
/// <c>sentids</c> and collects the phrases captured by the "$term" group.
/// </summary>
/// <remarks>
/// Tokens inside a match are annotated as matched and get the matching pattern added to
/// their MatchedPatterns set, even when the phrase is later discarded.
/// </remarks>
/// <returns>
/// A triple of: the phrase-by-pattern counts; for each pattern, the (sentence id, start,
/// inclusive end) of every accepted match; and the phrases in which every token that was
/// added to the phrase already carried <c>label</c>.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
{
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
    // CollectionValuedMap<String, Integer>();
    try
    {
        ICollection <CandidatePhrase> alreadyLabeledPhrases = new HashSet <CandidatePhrase>();
        TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>();
        CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
        foreach (string sentid in sentids)
        {
            IList <CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                // Per the original author: a higher branch limit makes matching faster but uses more memory.
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    // [s, e) is the token range captured by the "$term" group.
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        // Walk left to the nearest token not carrying the label.
                        // NOTE(review): if every token to the left is labeled, the loop ends
                        // without a break and s is left unchanged rather than becoming 0 - confirm intent.
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        // Walk right to the nearest token not carrying the label.
                        // NOTE(review): likewise, e is left unchanged when every token up to the
                        // end of the sentence is labeled - confirm intent.
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    // addedindices[k] records whether token s + k was appended to the phrase.
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // not needed as initialized false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        // Create the token's MatchedPatterns set on first use.
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        // Any token carrying one of the configured (class, value) pairs rejects the whole phrase.
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            // The token joins the phrase unless it is a stop word that is to be stripped.
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // Reject the phrase when a single skipped token sits directly between two added tokens.
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        // The recorded end index is inclusive, hence e - 1.
                        matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        // Log and rethrow.
        logger.Error(e);
        throw;
    }
}
/// <summary>
/// Rewrites the leaves of a tree so they carry lemmas and/or morphological analyses.
/// </summary>
/// <remarks>
/// Increments the <c>nTokens</c> counter for every leaf, and <c>nMorphAnalyses</c> for every
/// leaf with a non-empty original text when <paramref name="addMorphoToLeaves"/> is set.
/// A missing lemma falls back to the surface word; the lemmas "(" and ")" are replaced by
/// "-LRB-" and "-RRB-".
/// </remarks>
/// <param name="tree">Tree whose leaves are modified in place; all leaves must be CoreLabels.</param>
/// <param name="lemmasAsLeaves">When set, each leaf's word, value and lemma are set to the lemma.</param>
/// <param name="addMorphoToLeaves">
/// When set, each leaf's word and value become value + MorphoMark + lemma + LemmaMark + analysis,
/// where the analysis is the leaf's original text (or <c>NoAnalysis</c> when empty).
/// </param>
/// <exception cref="ArgumentException">A leaf label is not a CoreLabel.</exception>
public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
{
    IList <ILabel> labels = tree.Yield();
    foreach (ILabel label in labels)
    {
        ++nTokens;
        if (!(label is CoreLabel))
        {
            throw new ArgumentException("Only works with CoreLabels trees");
        }
        CoreLabel coreLabel = (CoreLabel)label;
        string lemma = coreLabel.Lemma();
        //PTB escaping since we're going to put this in the leaf
        if (lemma == null)
        {
            // No lemma, so just add the surface form
            lemma = coreLabel.Word();
        }
        else if (lemma.Equals("("))
        {
            lemma = "-LRB-";
        }
        else if (lemma.Equals(")"))
        {
            lemma = "-RRB-";
        }

        if (lemmasAsLeaves)
        {
            coreLabel.SetWord(lemma);
            coreLabel.SetValue(lemma);
            coreLabel.SetLemma(lemma);
        }

        if (addMorphoToLeaves)
        {
            string morphStr = coreLabel.OriginalText();
            if (morphStr == null || morphStr.Equals(string.Empty))
            {
                morphStr = MorphoFeatureSpecification.NoAnalysis;
            }
            else
            {
                ++nMorphAnalyses;
            }
            // Normalize punctuation analyses
            if (morphStr.StartsWith("PONCT"))
            {
                morphStr = "PUNC";
            }
            // .NET composite formatting uses {n} placeholders; "%s" would be emitted
            // literally and the arguments dropped.
            string newLeaf = string.Format("{0}{1}{2}{3}{4}", coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
            coreLabel.SetValue(newLeaf);
            coreLabel.SetWord(newLeaf);
        }
    }
}
/// <summary>
/// Finds the tree node that serves as the syntactic head of mention <paramref name="m"/>.
/// </summary>
/// <remarks>
/// Strategies are tried in this order:
/// 1. a constituent of <paramref name="root"/> whose span exactly matches the mention;
/// 2. when <c>allowReparsing</c> is set, a fresh parse of the mention extent wrapped in
///    the carrier sentence "It was ... ." (this branch always returns);
/// 3. the smallest constituent found by <c>FindTreeWithSmallestSpan</c>, accepted only if
///    its head index lies inside the mention;
/// 4. the leaf at the last token whose tag starts with "N" (scanning stops at the first
///    tag starting with "W"), defaulting to the last token of the mention.
/// </remarks>
/// <param name="m">Mention whose head is sought.</param>
/// <param name="root">Parse tree of the sentence containing the mention.</param>
/// <param name="tokens">Tokens of the sentence, indexed like the mention's start/end indices.</param>
/// <returns>The head node (or leaf) chosen by the first strategy that succeeds.</returns>
protected internal virtual Tree FindSyntacticHead(Mention m, Tree root, IList<CoreLabel> tokens)
{
    // A mention ending in a possessive marker ('s or ') drops that final token,
    // unless the marker is the only token of the mention.
    int endIdx = m.endIndex;
    int spanSize = m.originalSpan.Count;
    if (spanSize > 0)
    {
        string finalWord = m.originalSpan[spanSize - 1].Get(typeof(CoreAnnotations.TextAnnotation));
        bool isPossessiveMarker = finalWord.Equals("'s") || finalWord.Equals("'");
        if (isPossessiveMarker && spanSize != 1)
        {
            endIdx--;
        }
    }

    // Strategy 1: exact span match in the existing parse.
    Tree exactMatch = FindTreeWithSpan(root, m.startIndex, endIdx);
    if (exactMatch != null)
    {
        return SafeHead(exactMatch, endIdx);
    }

    // Strategy 2: no exact match, so parse the mention extent embedded in a small
    // carrier sentence, which helps the parser.
    if (allowReparsing)
    {
        // Number of carrier words ("It", "was") placed before the extent.
        const int leadingContextWords = 2;
        int skippedDashes = 0;

        IList<CoreLabel> carrierSentence = new List<CoreLabel>();
        carrierSentence.Add(InitCoreLabel("It"));
        carrierSentence.Add(InitCoreLabel("was"));
        for (int pos = m.startIndex; pos < endIdx; pos++)
        {
            CoreLabel source = tokens[pos];
            if ("-".Equals(source.Word()))
            {
                // Separated dashes mess with the parser too badly; leave them out
                // and remember how many were dropped.
                skippedDashes++;
                continue;
            }
            // Copy the token in case the parser does things like put new indices on it.
            carrierSentence.Add((CoreLabel)source.LabelFactory().NewLabel(source));
        }
        carrierSentence.Add(InitCoreLabel("."));

        // Constrain the parse to the extent: start after the carrier words and
        // stop before the final period. Any constituent label is accepted.
        ParserConstraint extentConstraint = new ParserConstraint(leadingContextWords, carrierSentence.Count - 1, Pattern.Compile(".*"));
        IList<ParserConstraint> constraints = Java.Util.Collections.SingletonList(extentConstraint);
        Tree reparsed = Parse(carrierSentence, constraints);
        ConvertToCoreLabels(reparsed);
        // The reparsed tree has the carrier words in front, so shift the indexing.
        reparsed.IndexSpans(m.startIndex - leadingContextWords);

        Tree extentSubtree = FindPartialSpan(reparsed, m.startIndex);
        Tree extentHead = SafeHead(extentSubtree, endIdx);
        System.Diagnostics.Debug.Assert((extentHead != null));

        // extentHead lives in the local reparse; map it back to a node of the main
        // tree. Because dashes were removed, its real index may be larger by up to
        // the number of skipped dashes.
        CoreLabel extentHeadLabel = (CoreLabel)extentHead.Label();
        Tree realHead = FunkyFindLeafWithApproximateSpan(root, extentHeadLabel.Value(), extentHeadLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), skippedDashes);
        System.Diagnostics.Debug.Assert((realHead != null));
        return realHead;
    }

    // Strategy 3: reparsing was not allowed; look for a span in the tree that
    // happens to have its head inside the mention.
    Tree wordMatch = FindTreeWithSmallestSpan(root, m.startIndex, endIdx);
    Tree candidateHead = wordMatch == null ? null : SafeHead(wordMatch, endIdx);
    if (candidateHead != null)
    {
        int headTokenIdx = ((CoreLabel)candidateHead.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
        if (headTokenIdx >= m.startIndex && headTokenIdx < endIdx)
        {
            return candidateHead;
        }
    }

    // Strategy 4: guess the last noun-tagged word (or simply the last word).
    int lastNounIdx = endIdx - 1;
    for (int pos = m.startIndex; pos < m.endIndex; pos++)
    {
        string tag = tokens[pos].Tag();
        if (tag.StartsWith("N"))
        {
            lastNounIdx = pos;
            continue;
        }
        if (tag.StartsWith("W"))
        {
            break;
        }
    }
    return root.GetLeaves()[lastNounIdx];
}
/* * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq, TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted, * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{ * Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>(); * Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>(); * Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList(); * List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an"); * * for(Entry<Integer, Double> en: patterns.entrySet()){ * Integer pindex = en.getKey(); * SurfacePattern p = constVars.getPatternIndex().get(pindex); * String[] n = p.getSimplerTokensNext(); * String[] pr = p.getSimplerTokensPrev(); * boolean rest = false; * if(n!=null){ * for(String e: n){ * if(!specialWords.contains(e)){ * rest = true; * break; * } * } * } * if(rest == false && pr!=null){ * for(String e: pr){ * if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){ * rest = true; * break; * } * } * } * if(rest) * patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue()); * else * patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue()); * } * * * * Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex()); * * if (constVars.batchProcessSents) { * List<File> filesToLoad; * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0) * filesToLoad = Data.sentsFiles; * else{ * filesToLoad = new ArrayList<File>(); * for (String fname : sentidswithfilerest.keySet()) { * String filename; * // if(!constVars.usingDirForSentsInIndex) * // filename = constVars.saveSentencesSerDir+"/"+fname; * // else * filename = fname; * filesToLoad.add(new File(filename)); * } 
* } * * for (File fname : filesToLoad) { * Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname); * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname); * * if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){ * * String filename; * // if(constVars.usingDirForSentsInIndex) * // filename = constVars.saveSentencesSerDir+"/"+fname.getName(); * // else * filename = fname.getAbsolutePath(); * * Set<String> sentIDs = sentidswithfilerest.get(filename); * if (sentIDs != null){ * this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat); * } else * Redwood.log(Redwood.DBG, "No sentIds for " + filename + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet()); * } * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){ * this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat); * } * * if (computeDataFreq){ * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound); * Data.fileNamesUsedToComputeRawFreq.add(fname.getName()); * } * } * * //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error. 
* if(computeDataFreq){ * for(File f: Data.sentsFiles){ * if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){ * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f); * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound); * Data.fileNamesUsedToComputeRawFreq.add(f.getName()); * } * } * } * * } else { * * if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) { * String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0); * Set<String> sentids = sentidswithfilerest.get(filename); * if (sentids != null) { * this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat); * } else * throw new RuntimeException("How come no sentIds for " + filename + ". Index keyset is " + constVars.invertedIndex.getKeySet()); * } * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){ * this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat); * } * Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound); * } * Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size()); * } */ private void StatsWithoutApplyingPatterns(IDictionary <string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted) { foreach (KeyValuePair <string, DataInstance> sentEn in sents) { IDictionary <int, ICollection <E> > pat4Sent = patternsForEachToken.GetPatternsForAllTokens(sentEn.Key); if (pat4Sent == null) { throw new Exception("How come there are no patterns for " + sentEn.Key); } foreach (KeyValuePair <int, ICollection <E> > en in pat4Sent) { CoreLabel token = null; ICollection <E> p1 = en.Value; // Set<Integer> p1 = en.getValue().first(); // Set<Integer> p2 = en.getValue().second(); // Set<Integer> p3 = 
en.getValue().third(); foreach (E index in patternsLearnedThisIter.KeySet()) { if (p1.Contains(index)) { if (token == null) { token = sentEn.Value.GetTokens()[en.Key]; } wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(token.Word(), token.Lemma()), index); } } } } }
/// <summary>
/// Builds the feature counter describing a single mention.
/// </summary>
/// <remarks>
/// Features are added in groups: mention type/gender/person/NER, the mention's singleton
/// features, length and position, lexical context (neighbouring words and POS tags),
/// dependency path, optional constituency-parse features (when <c>useConstituencyParse</c>
/// is set), containment relative to other mentions sharing the same head index, and a
/// handful of rule-derived indicators.
/// </remarks>
/// <param name="doc">Document containing the mention.</param>
/// <param name="m">Mention to describe.</param>
/// <param name="mentionsByHeadIndex">
/// Mentions grouped by head index; must contain an entry for <c>m.headIndex</c>
/// (the indexer is used, so a missing key throws).
/// </param>
/// <returns>A new counter holding the extracted features.</returns>
private ICounter<string> GetFeatures(Document doc, Mention m, IDictionary<int, IList<Mention>> mentionsByHeadIndex)
{
    ICounter<string> features = new ClassicCounter<string>();

    // type features
    features.IncrementCount("mention-type=" + m.mentionType);
    features.IncrementCount("gender=" + m.gender);
    features.IncrementCount("person-fine=" + m.person);
    features.IncrementCount("head-ne-type=" + m.nerString);
    IList<string> singletonFeatures = m.GetSingletonFeatures(dictionaries);
    foreach (KeyValuePair<int, string> e in SingletonFeatures)
    {
        if (e.Key < singletonFeatures.Count)
        {
            features.IncrementCount(e.Value + "=" + singletonFeatures[e.Key]);
        }
    }

    // length and location features
    AddNumeric(features, "mention-length", m.SpanToString().Length);
    AddNumeric(features, "mention-words", m.originalSpan.Count);
    AddNumeric(features, "sentence-words", m.sentenceWords.Count);
    features.IncrementCount("sentence-words=" + Bin(m.sentenceWords.Count));
    features.IncrementCount("mention-position", m.mentionNum / (double)doc.predictedMentions.Count);
    features.IncrementCount("sentence-position", m.sentNum / (double)doc.numSentences);

    // lexical features
    CoreLabel firstWord = FirstWord(m);
    CoreLabel lastWord = LastWord(m);
    CoreLabel headWord = HeadWord(m);
    CoreLabel prevWord = PrevWord(m);
    CoreLabel nextWord = NextWord(m);
    CoreLabel prevprevWord = PrevprevWord(m);
    CoreLabel nextnextWord = NextnextWord(m);
    string headPOS = GetPOS(headWord);
    string firstPOS = GetPOS(firstWord);
    string lastPOS = GetPOS(lastWord);
    string prevPOS = GetPOS(prevWord);
    string nextPOS = GetPOS(nextWord);
    string prevprevPOS = GetPOS(prevprevWord);
    string nextnextPOS = GetPOS(nextnextWord);
    features.IncrementCount("first-word=" + WordIndicator(firstWord, firstPOS));
    features.IncrementCount("last-word=" + WordIndicator(lastWord, lastPOS));
    features.IncrementCount("head-word=" + WordIndicator(headWord, headPOS));
    features.IncrementCount("next-word=" + WordIndicator(nextWord, nextPOS));
    features.IncrementCount("prev-word=" + WordIndicator(prevWord, prevPOS));
    features.IncrementCount("next-bigram=" + WordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
    features.IncrementCount("prev-bigram=" + WordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
    features.IncrementCount("next-pos=" + nextPOS);
    features.IncrementCount("prev-pos=" + prevPOS);
    features.IncrementCount("first-pos=" + firstPOS);
    features.IncrementCount("last-pos=" + lastPOS);
    features.IncrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
    features.IncrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
    AddDependencyFeatures(features, "parent", GetDependencyParent(m), true);
    AddFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
    AddFeature(features, "is-generic", m.originalSpan.Count == 1 && firstPOS.Equals("NNS"));

    // syntax features: dependency relations from the head upwards, at most 3 edges
    IndexedWord w = m.headIndexedWord;
    string depPath = string.Empty;
    int depth = 0;
    while (w != null)
    {
        SemanticGraphEdge parentEdge = GetDependencyParent(m, w);
        depth++;
        if (depth <= 3 && parentEdge != null)
        {
            depPath += (depPath.IsEmpty() ? string.Empty : "_") + parentEdge.GetRelation().ToString();
            features.IncrementCount("dep-path=" + depPath);
            w = parentEdge.GetSource();
        }
        else
        {
            w = null;
        }
    }

    if (useConstituencyParse)
    {
        int fullEmbeddingLevel = HeadEmbeddingLevel(m.contextParseTree, m.headIndex);
        int mentionEmbeddingLevel = HeadEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
        if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1)
        {
            features.IncrementCount("mention-embedding-level=" + Bin(fullEmbeddingLevel - mentionEmbeddingLevel));
            features.IncrementCount("head-embedding-level=" + Bin(mentionEmbeddingLevel));
        }
        else
        {
            features.IncrementCount("undetermined-embedding-level");
        }
        features.IncrementCount("num-embedded-nps=" + Bin(NumEmbeddedNps(m.mentionSubTree)));

        // Path of node labels from the head's parent towards the root; cut off
        // after 4 nodes or at the first node labelled "S".
        string syntaxPath = string.Empty;
        Tree tree = m.contextParseTree;
        Tree head = tree.GetLeaves()[m.headIndex].Ancestor(1, tree);
        depth = 0;
        foreach (Tree node in tree.PathNodeToNode(head, tree))
        {
            syntaxPath += node.Value() + "-";
            features.IncrementCount("syntax-path=" + syntaxPath);
            depth++;
            if (depth >= 4 || node.Value().Equals("S"))
            {
                break;
            }
        }
    }

    // mention containment features
    // The previous code passed a null predicate to AnyMatch for both features, so
    // neither could be evaluated and the two were indistinguishable.
    // NOTE(review): the predicates below are reconstructed from the feature names
    // and from Mention.InsideIn; confirm against the intended definition.
    bool containedInOtherMention = false;
    bool containsOtherMention = false;
    foreach (Mention other in mentionsByHeadIndex[m.headIndex])
    {
        if (object.ReferenceEquals(other, m))
        {
            // A mention is never compared with itself.
            continue;
        }
        containedInOtherMention = containedInOtherMention || m.InsideIn(other);
        containsOtherMention = containsOtherMention || other.InsideIn(m);
        if (containedInOtherMention && containsOtherMention)
        {
            break;
        }
    }
    AddFeature(features, "contained-in-other-mention", containedInOtherMention);
    AddFeature(features, "contains-other-mention", containsOtherMention);

    // features from dcoref rules
    AddFeature(features, "bare-plural", m.originalSpan.Count == 1 && headPOS.Equals("NNS"));
    AddFeature(features, "quantifier-start", dictionaries.quantifiers.Contains(firstWord.Word().ToLower()));
    AddFeature(features, "negative-start", firstWord.Word().ToLower().Matches("none|no|nothing|not"));
    AddFeature(features, "partitive", RuleBasedCorefMentionFinder.PartitiveRule(m, m.sentenceWords, dictionaries));
    AddFeature(features, "adjectival-demonym", dictionaries.IsAdjectivalDemonym(m.SpanToString()));
    if (doc.docType != Document.DocType.Article && m.person == Dictionaries.Person.You && nextWord != null && Sharpen.Runtime.EqualsIgnoreCase(nextWord.Word(), "know"))
    {
        features.IncrementCount("generic-you");
    }
    return features;
}
/// <summary>
/// Builds the feature counter describing a pair of mentions.
/// </summary>
/// <remarks>
/// A debug assertion checks that <paramref name="m1"/> appears earlier than
/// <paramref name="m2"/>. Features are added in groups: global/document, singleton feature
/// conjunctions, dependency and role, agreement, string matching (skipped when either
/// mention is pronominal), distance, speaker/discourse, other rule-derived indicators,
/// and optional constituency-parse features (when <c>useConstituencyParse</c> is set).
/// Two single-mention <c>CorefCluster</c>s with random ids are created only to call the
/// cluster-level <c>CorefRules</c> predicates.
/// </remarks>
/// <param name="doc">Document containing both mentions.</param>
/// <param name="m1">The earlier mention.</param>
/// <param name="m2">The later mention.</param>
/// <returns>A new counter holding the extracted pair features.</returns>
private ICounter<string> GetFeatures(Document doc, Mention m1, Mention m2)
{
    System.Diagnostics.Debug.Assert((m1.AppearEarlierThan(m2)));
    ICounter<string> features = new ClassicCounter<string>();

    // global features
    features.IncrementCount("bias");
    if (useDocSource)
    {
        features.IncrementCount("doc-type=" + doc.docType);
        if (doc.docInfo != null && doc.docInfo.Contains("DOC_ID"))
        {
            features.IncrementCount("doc-source=" + doc.docInfo["DOC_ID"].Split("/")[1]);
        }
    }

    // singleton feature conjunctions
    IList<string> singletonFeatures1 = m1.GetSingletonFeatures(dictionaries);
    IList<string> singletonFeatures2 = m2.GetSingletonFeatures(dictionaries);
    foreach (KeyValuePair<int, string> e in SingletonFeatures)
    {
        if (e.Key < singletonFeatures1.Count && e.Key < singletonFeatures2.Count)
        {
            features.IncrementCount(e.Value + "=" + singletonFeatures1[e.Key] + "_" + singletonFeatures2[e.Key]);
        }
    }
    SemanticGraphEdge p1 = GetDependencyParent(m1);
    SemanticGraphEdge p2 = GetDependencyParent(m2);
    features.IncrementCount("dep-relations=" + (p1 == null ? "null" : p1.GetRelation()) + "_" + (p2 == null ? "null" : p2.GetRelation()));
    features.IncrementCount("roles=" + GetRole(m1) + "_" + GetRole(m2));
    CoreLabel headCL1 = HeadWord(m1);
    CoreLabel headCL2 = HeadWord(m2);
    string headPOS1 = GetPOS(headCL1);
    string headPOS2 = GetPOS(headCL2);
    features.IncrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
    features.IncrementCount("head-words=" + WordIndicator("h_" + headCL1.Word().ToLower() + "_" + headCL2.Word().ToLower(), headPOS1 + "_" + headPOS2));

    // agreement features
    AddFeature(features, "animacies-agree", m2.AnimaciesAgree(m1));
    AddFeature(features, "attributes-agree", m2.AttributesAgree(m1, dictionaries));
    AddFeature(features, "entity-types-agree", m2.EntityTypesAgree(m1, dictionaries));
    AddFeature(features, "numbers-agree", m2.NumbersAgree(m1));
    AddFeature(features, "genders-agree", m2.GendersAgree(m1));
    AddFeature(features, "ner-strings-equal", m1.nerString.Equals(m2.nerString));

    // string matching features
    AddFeature(features, "antecedent-head-in-anaphor", HeadContainedIn(m1, m2));
    AddFeature(features, "anaphor-head-in-antecedent", HeadContainedIn(m2, m1));
    if (m1.mentionType != Dictionaries.MentionType.Pronominal && m2.mentionType != Dictionaries.MentionType.Pronominal)
    {
        AddFeature(features, "antecedent-in-anaphor", m2.SpanToString().ToLower().Contains(m1.SpanToString().ToLower()));
        AddFeature(features, "anaphor-in-antecedent", m1.SpanToString().ToLower().Contains(m2.SpanToString().ToLower()));
        AddFeature(features, "heads-equal", Sharpen.Runtime.EqualsIgnoreCase(m1.headString, m2.headString));
        AddFeature(features, "heads-agree", m2.HeadsAgree(m1));
        AddFeature(features, "exact-match", m1.ToString().Trim().ToLower().Equals(m2.ToString().Trim().ToLower()));
        AddFeature(features, "partial-match", RelaxedStringMatch(m1, m2));

        // Edit distance normalised by the combined length. Two empty strings would
        // give 0 / 0.0 = NaN, which would poison the counter, so they are treated
        // as distance 0.
        string span1 = m1.SpanToString();
        string span2 = m2.SpanToString();
        int spanLengthSum = span1.Length + span2.Length;
        double editDistance = spanLengthSum == 0
            ? 0.0
            : StringUtils.EditDistance(span1, span2) / (double)spanLengthSum;
        features.IncrementCount("edit-distance", editDistance);
        // The binned value is part of the feature key, so it is formatted with the
        // invariant culture; otherwise the key would change with the thread's
        // culture (e.g. "0,1" instead of "0.1").
        features.IncrementCount("edit-distance=" + ((int)(editDistance * 10) / 10.0).ToString(System.Globalization.CultureInfo.InvariantCulture));

        int headLengthSum = m1.headString.Length + m2.headString.Length;
        double headEditDistance = headLengthSum == 0
            ? 0.0
            : StringUtils.EditDistance(m1.headString, m2.headString) / (double)headLengthSum;
        features.IncrementCount("head-edit-distance", headEditDistance);
        features.IncrementCount("head-edit-distance=" + ((int)(headEditDistance * 10) / 10.0).ToString(System.Globalization.CultureInfo.InvariantCulture));
    }

    // distance features
    AddNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum);
    AddNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum);
    if (m2.sentNum == m1.sentNum)
    {
        AddNumeric(features, "word-distance", m2.startIndex - m1.endIndex);
        if (m1.endIndex > m2.startIndex)
        {
            features.IncrementCount("spans-intersect");
        }
    }

    // setup for dcoref features: wrap each mention in its own single-element cluster
    ICollection<Mention> ms1 = new HashSet<Mention>();
    ms1.Add(m1);
    ICollection<Mention> ms2 = new HashSet<Mention>();
    ms2.Add(m2);
    Random r = new Random();
    CorefCluster c1 = new CorefCluster(20000 + r.NextInt(10000), ms1);
    CorefCluster c2 = new CorefCluster(10000 + r.NextInt(10000), ms2);
    string s2 = m2.LowercaseNormalizedSpanString();
    string s1 = m1.LowercaseNormalizedSpanString();

    // discourse dcoref features
    AddFeature(features, "mention-speaker-PER0", Sharpen.Runtime.EqualsIgnoreCase(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)), "PER0"));
    AddFeature(features, "antecedent-is-anaphor-speaker", CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    AddFeature(features, "same-speaker", CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "person-disagree-same-speaker", CorefRules.EntityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.AntecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
    // Literal-first comparison: a head word without a speaker annotation yields
    // false instead of a null dereference.
    AddFeature(features, "discourse-you-PER0", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && "PER0".Equals(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation))));
    AddFeature(features, "speaker-match-i-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "speaker-match-speaker-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    AddFeature(features, "speaker-match-i-speaker", m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
    AddFeature(features, "speaker-match-you-you", dictionaries.secondPersonPronouns.Contains(s1) && dictionaries.secondPersonPronouns.Contains(s2) && CorefRules.EntitySameSpeaker(doc, m2, m1));
    AddFeature(features, "discourse-between-two-person", ((m2.person == Dictionaries.Person.I && m1.person == Dictionaries.Person.You || (m2.person == Dictionaries.Person.You && m1.person == Dictionaries.Person.I)) && (m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1) && doc.docType == Document.DocType.Conversation));
    AddFeature(features, "incompatible-not-match", m1.person != Dictionaries.Person.I && m2.person != Dictionaries.Person.I && (CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries) || CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries)));
    int utteranceDist = Math.Abs(m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)));
    if (doc.docType != Document.DocType.Article && utteranceDist == 1 && !CorefRules.EntitySameSpeaker(doc, m2, m1))
    {
        AddFeature(features, "speaker-mismatch-i-i", m1.person == Dictionaries.Person.I && m2.person == Dictionaries.Person.I);
        AddFeature(features, "speaker-mismatch-you-you", m1.person == Dictionaries.Person.You && m2.person == Dictionaries.Person.You);
        AddFeature(features, "speaker-mismatch-we-we", m1.person == Dictionaries.Person.We && m2.person == Dictionaries.Person.We);
    }

    // other dcoref features
    string firstWord1 = FirstWord(m1).Word().ToLower();
    AddFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.Equals("a") || firstWord1.Equals("an"))));
    AddFeature(features, "far-this", m2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(m2.sentNum - m1.sentNum) > 3);
    AddFeature(features, "per0-you-in-article", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && "PER0".Equals(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation))));
    AddFeature(features, "inside-in", m2.InsideIn(m1) || m1.InsideIn(m2));
    AddFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.Contains(m1.originalSpan[0].Lemma()) || dictionaries.indefinitePronouns.Contains(m2.originalSpan[0].Lemma()));
    AddFeature(features, "entity-attributes-agree", CorefRules.EntityAttributesAgree(c2, c1));
    AddFeature(features, "entity-token-distance", CorefRules.EntityTokenDistance(m2, m1));
    AddFeature(features, "i-within-i", CorefRules.EntityIWithinI(m2, m1, dictionaries));
    AddFeature(features, "exact-string-match", CorefRules.EntityExactStringMatch(c2, c1, dictionaries, doc.roleSet));
    AddFeature(features, "entity-relaxed-heads-agree", CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
    AddFeature(features, "is-acronym", CorefRules.EntityIsAcronym(doc, c2, c1));
    AddFeature(features, "demonym", m2.IsDemonym(m1, dictionaries));
    AddFeature(features, "incompatible-modifier", CorefRules.EntityHaveIncompatibleModifier(m2, m1));
    AddFeature(features, "head-lemma-match", m1.headWord.Lemma().Equals(m2.headWord.Lemma()));
    AddFeature(features, "words-included", CorefRules.EntityWordsIncluded(c2, c1, m2, m1));
    AddFeature(features, "extra-proper-noun", CorefRules.EntityHaveExtraProperNoun(m2, m1, new HashSet<string>()));
    AddFeature(features, "number-in-later-mentions", CorefRules.EntityNumberInLaterMention(m2, m1));
    AddFeature(features, "sentence-context-incompatible", CorefRules.SentenceContextIncompatible(m2, m1, dictionaries));

    // syntax features
    if (useConstituencyParse)
    {
        if (m1.sentNum == m2.sentNum)
        {
            // Walk up from m2's subtree, counting ancestors whose label starts
            // with "S", until an ancestor dominates m1's subtree or the top of
            // the tree is reached.
            int clauseCount = 0;
            Tree tree = m2.contextParseTree;
            Tree current = m2.mentionSubTree;
            while (true)
            {
                current = current.Ancestor(1, tree);
                if (current.Label().Value().StartsWith("S"))
                {
                    clauseCount++;
                }
                if (current.Dominates(m1.mentionSubTree))
                {
                    break;
                }
                if (current.Label().Value().Equals("ROOT") || current.Ancestor(1, tree) == null)
                {
                    break;
                }
            }
            features.IncrementCount("clause-count", clauseCount);
            features.IncrementCount("clause-count=" + Bin(clauseCount));
        }
        if (RuleBasedCorefMentionFinder.IsPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.IsPleonastic(m1, m1.contextParseTree))
        {
            features.IncrementCount("pleonastic-it");
        }
        if (MaximalNp(m1.mentionSubTree) == MaximalNp(m2.mentionSubTree))
        {
            features.IncrementCount("same-maximal-np");
        }
        bool m1Embedded = HeadEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
        bool m2Embedded = HeadEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
        features.IncrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
    }
    return features;
}