/// <summary>
/// Loads the semantic resources selected by <paramref name="props"/>. When word embeddings are enabled,
/// the serialized vector map is read first; if that fails with an I/O error the word2vec file is parsed
/// instead (and re-serialized unless the serialized path starts with "edu"). Afterwards
/// <c>dimVector</c> is set to the length of the first vector in the map.
/// </summary>
/// <param name="props">Configuration read through <c>HybridCorefProperties</c>.</param>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.InvalidOperationException">The loaded vector map contains no entries.</exception>
public virtual void LoadSemantics(Properties props)
{
    log.Info("LOADING SEMANTICS");
    // wordnet = new WordNet();

    // load word vector
    if (!HybridCorefProperties.LoadWordEmbedding(props))
    {
        return;
    }

    log.Info("LOAD: WordVectors");
    string wordvectorFile = HybridCorefProperties.GetPathSerializedWordVectors(props);
    string word2vecFile = HybridCorefProperties.GetPathWord2Vec(props);
    try
    {
        // Try to read the serialized vectors
        vectors = VectorMap.Deserialize(wordvectorFile);
    }
    catch (IOException e)
    {
        // If that fails, try to read the vectors from the word2vec file.
        // A missing (null) word2vec path is treated like a missing file so the original
        // I/O failure is reported instead of a null dereference.
        if (word2vecFile != null && new File(word2vecFile).Exists())
        {
            vectors = VectorMap.ReadWord2Vec(word2vecFile);
            // Ordinal comparison: the "edu" prefix is a literal path marker, not linguistic text.
            if (wordvectorFile != null && !wordvectorFile.StartsWith("edu", System.StringComparison.Ordinal))
            {
                vectors.Serialize(wordvectorFile);
            }
        }
        else
        {
            // If that fails, give up and crash
            throw new RuntimeIOException(e);
        }
    }

    // An enumerator is positioned before the first element until MoveNext() is called,
    // so Current must not be read straight after GetEnumerator().
    var entries = vectors.GetEnumerator();
    if (!entries.MoveNext())
    {
        throw new System.InvalidOperationException("Word vector map is empty; cannot determine the vector dimension.");
    }
    dimVector = entries.Current.Value.Length;
}
// load sieve (from file or make a deterministic sieve)
/// <summary>
/// Builds the sieve called <paramref name="sievename"/> according to its configured classifier type:
/// a rule sieve is instantiated reflectively from the <c>edu.stanford.nlp.coref.hybrid.sieve</c> package,
/// an RF sieve is read from its model path and given its merge threshold, and an oracle sieve is constructed directly.
/// </summary>
/// <param name="props">Configuration read through <c>HybridCorefProperties</c>.</param>
/// <param name="sievename">Name of the sieve to load.</param>
/// <returns>The loaded sieve.</returns>
/// <exception cref="System.Exception">The classifier type is none of Rule, Rf or Oracle.</exception>
public static Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve LoadSieve(Properties props, string sievename)
{
    var classifierType = HybridCorefProperties.GetClassifierType(props, sievename);

    if (classifierType == Sieve.ClassifierType.Rule)
    {
        // log.info("Loading sieve: "+sievename+" ...");
        string typeName = "edu.stanford.nlp.coref.hybrid.sieve." + sievename;
        DeterministicCorefSieve ruleSieve = (DeterministicCorefSieve)Sharpen.Runtime.GetType(typeName).GetConstructor().NewInstance();
        ruleSieve.props = props;
        ruleSieve.lang = HybridCorefProperties.GetLanguage(props);
        return ruleSieve;
    }

    if (classifierType == Sieve.ClassifierType.Rf)
    {
        log.Info("Loading sieve: " + sievename + " from " + HybridCorefProperties.GetPathModel(props, sievename) + " ... ");
        RFSieve forestSieve = IOUtils.ReadObjectFromURLOrClasspathOrFileSystem(HybridCorefProperties.GetPathModel(props, sievename));
        forestSieve.thresMerge = HybridCorefProperties.GetMergeThreshold(props, sievename);
        log.Info("done. Merging threshold: " + forestSieve.thresMerge);
        return forestSieve;
    }

    if (classifierType == Sieve.ClassifierType.Oracle)
    {
        OracleSieve oracle = new OracleSieve(props, sievename);
        oracle.props = props;
        return oracle;
    }

    throw new Exception("no sieve type specified");
}
/// <summary>
/// Initializes a sieve named <paramref name="sievename"/>, reading its language, antecedent/mention
/// type filters (both typed and string forms) and maximum sentence distance from <paramref name="props"/>.
/// </summary>
/// <param name="props">Configuration read through <c>HybridCorefProperties</c>.</param>
/// <param name="sievename">Name under which the per-sieve settings are looked up.</param>
public Sieve(Properties props, string sievename)
{
    lang = HybridCorefProperties.GetLanguage(props);
    this.sievename = sievename;

    // Typed filters and sentence window.
    aType = HybridCorefProperties.GetAntecedentType(props, sievename);
    mType = HybridCorefProperties.GetMentionType(props, sievename);
    maxSentDist = HybridCorefProperties.GetMaxSentDistForSieve(props, sievename);

    // String forms of the same filters.
    mTypeStr = HybridCorefProperties.GetMentionTypeStr(props, sievename);
    aTypeStr = HybridCorefProperties.GetAntecedentTypeStr(props, sievename);
}
/// <summary>
/// Tells whether the pair (<paramref name="ant"/>, <paramref name="m"/>) should be skipped in analysis mode.
/// Always false when analysis is disabled; otherwise true only when the antecedent matches the configured
/// skip-antecedent type and the mention matches the configured skip-mention type.
/// </summary>
/// <param name="ant">Candidate antecedent.</param>
/// <param name="m">Mention being resolved.</param>
/// <param name="props">Configuration read through <c>HybridCorefProperties</c>.</param>
protected internal static bool SkipForAnalysis(Mention ant, Mention m, Properties props)
{
    if (HybridCorefProperties.DoAnalysis(props))
    {
        string mentionTypeToSkip = HybridCorefProperties.GetSkipMentionType(props);
        string antecedentTypeToSkip = HybridCorefProperties.GetSkipAntecedentType(props);

        // The antecedent is tested first; the mention is only tested when the antecedent matched.
        if (!MatchedMentionType(ant, antecedentTypeToSkip))
        {
            return false;
        }
        return MatchedMentionType(m, mentionTypeToSkip);
    }
    return false;
}
/// <summary>
/// Loads every sieve listed in the configured sieve property, in order. When a sieve is currently being
/// trained, only the sieves listed before it are loaded.
/// </summary>
/// <param name="props">Configuration read through <c>HybridCorefProperties</c>.</param>
/// <returns>The loaded sieves, in configuration order.</returns>
/// <exception cref="System.Exception"/>
public static IList<Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve> LoadSieves(Properties props)
{
    string sieveProp = HybridCorefProperties.GetSieves(props);
    string currentSieveForTrain = HybridCorefProperties.GetCurrentSieveForTrain(props);

    IList<Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve> sieves = new List<Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve>();
    foreach (string sievename in SplitSieveNames(sieveProp, currentSieveForTrain))
    {
        sieves.Add(LoadSieve(props, sievename));
    }
    return sieves;
}

/// <summary>
/// Splits a comma-separated sieve list into names, keeping only the part that precedes
/// <paramref name="currentSieveForTrain"/> when one is given (matched literally).
/// </summary>
private static IList<string> SplitSieveNames(string sieveProp, string currentSieveForTrain)
{
    string listed = sieveProp;
    if (!string.IsNullOrEmpty(currentSieveForTrain))
    {
        int cut = sieveProp.IndexOf(currentSieveForTrain, System.StringComparison.Ordinal);
        if (cut >= 0)
        {
            listed = sieveProp.Substring(0, cut);
        }
    }

    // ",\s*" is a regular expression; string.Split would treat it as a literal separator.
    string[] pieces = System.Text.RegularExpressions.Regex.Split(listed.Trim(), ",\\s*");

    IList<string> names = new List<string>();
    foreach (string piece in pieces)
    {
        // The prefix before the training sieve ends in a comma, which yields a trailing empty
        // entry; an empty name can never identify a sieve, so it is dropped.
        string name = piece.Trim();
        if (name.Length > 0)
        {
            names.Add(name);
        }
    }
    return names;
}
/// <summary>
/// Runs this sieve over every predicted mention of <paramref name="document"/>, sentence by sentence,
/// calling <c>FindCoreferentAntecedent</c> for each mention that is not skipped by <c>SkipMentionType</c>.
/// </summary>
/// <param name="document">Document whose predicted mentions are resolved.</param>
/// <param name="dict">Dictionaries passed through to the antecedent search.</param>
/// <param name="props">Configuration; also decides whether debug output is collected.</param>
/// <returns>The accumulated log text (empty unless something was appended).</returns>
/// <exception cref="System.Exception"/>
public virtual string ResolveMention(Document document, Dictionaries dict, Properties props)
{
    StringBuilder logBuffer = new StringBuilder();

    if (HybridCorefProperties.Debug(props))
    {
        logBuffer.Append("=======================================================")
                 .Append(HybridCorefPrinter.PrintRawDoc(document, true, true));
    }

    foreach (IList<Mention> sentenceMentions in document.predictedMentions)
    {
        int position = 0;
        while (position < sentenceMentions.Count)
        {
            Mention current = sentenceMentions[position];
            if (!SkipMentionType(current, props))
            {
                FindCoreferentAntecedent(current, position, document, dict, props, logBuffer);
            }
            position++;
        }
    }

    return logBuffer.ToString();
}
/// <summary>
/// Searches backwards from the sentence of <paramref name="m"/> for the first antecedent this sieve
/// accepts. On success the mention's cluster is merged into the antecedent's cluster, the document's
/// incompatibility and acronym caches are merged, the absorbed cluster is removed from
/// <c>document.corefClusters</c>, and the search stops.
/// </summary>
/// <param name="m">Mention to resolve.</param>
/// <param name="mIdx">Index of <paramref name="m"/> within its sentence's mention list.</param>
/// <param name="document">Document holding mentions, clusters and the role set.</param>
/// <param name="dict">Dictionaries used by the ordering and matching rules.</param>
/// <param name="props">Configuration (analysis skipping, debug logging).</param>
/// <param name="sbLog">Receives the debug log entry when debugging is enabled.</param>
/// <exception cref="System.Exception"/>
public override void FindCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog)
{
    // check for skip: first mention only, discourse salience
    if (!this.flags.UseSpeakermatch && !this.flags.UseDiscoursematch && !this.flags.UseApposition && !this.flags.UsePredicatenominatives
        && this.SkipThisMention(document, m, document.corefClusters[m.corefClusterID], dict))
    {
        return;
    }

    ICollection<Mention> roleSet = document.roleSet;
    for (int sentJ = m.sentNum; sentJ >= 0; sentJ--)
    {
        // The distance only grows as sentJ decreases, so once the window is exceeded no earlier
        // sentence can qualify. Checked before building the antecedent list to avoid wasted work.
        // NOTE(review): assumes GetOrderedAntecedents has no side effects that callers rely on - confirm.
        if (maxSentDist != -1 && m.sentNum - sentJ > maxSentDist)
        {
            break;
        }

        IList<Mention> l = Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.GetOrderedAntecedents(m, sentJ, mIdx, document.predictedMentions, dict);
        MoveShorterSameHeadMentionsFirst(l);

        // log.info("antecedent ordering changed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
        foreach (Mention ant in l)
        {
            if (SkipForAnalysis(ant, m, props))
            {
                continue;
            }
            // m2 - antecedent of m1
            // Skip singletons according to the singleton predictor
            // (only for non-NE mentions)
            // Recasens, de Marneffe, and Potts (NAACL 2013)
            if (m.isSingleton && m.mentionType != Dictionaries.MentionType.Proper && ant.isSingleton && ant.mentionType != Dictionaries.MentionType.Proper)
            {
                continue;
            }
            // Already in the same cluster: nothing to merge.
            if (m.corefClusterID == ant.corefClusterID)
            {
                continue;
            }
            if (!mType.Contains(m.mentionType) || !aType.Contains(ant.mentionType))
            {
                continue;
            }
            if (m.mentionType == Dictionaries.MentionType.Pronominal)
            {
                if (!MatchedMentionType(m, mTypeStr))
                {
                    continue;
                }
                if (!MatchedMentionType(ant, aTypeStr))
                {
                    continue;
                }
            }

            CorefCluster c1 = document.corefClusters[m.corefClusterID];
            CorefCluster c2 = document.corefClusters[ant.corefClusterID];
            System.Diagnostics.Debug.Assert((c1 != null));
            System.Diagnostics.Debug.Assert((c2 != null));

            if (this.UseRoleSkip())
            {
                // Role-skip sieves only record role appositives; they never merge clusters.
                if (m.IsRoleAppositive(ant, dict))
                {
                    roleSet.Add(m);
                }
                else if (ant.IsRoleAppositive(m, dict))
                {
                    roleSet.Add(ant);
                }
                continue;
            }

            if (this.Coreferent(document, c1, c2, m, ant, dict, roleSet))
            {
                // print logs for analysis
                // if (doScore()) {
                //   printLogs(c1, c2, m1, m2, document, currentSieve);
                // }
                // print dcoref log
                if (HybridCorefProperties.Debug(props))
                {
                    sbLog.Append(HybridCorefPrinter.PrintErrorLogDcoref(m, ant, document, dict, mIdx, this.GetType().FullName));
                }

                // Remember the id before merging: c1 is absorbed into c2 and then dropped from the document.
                int removeID = c1.clusterID;
                // log.info("Merging ant "+c2+" with "+c1);
                CorefCluster.MergeClusters(c2, c1);
                document.MergeIncompatibles(c2, c1);
                document.MergeAcronymCache(c2, c1);
                // log.warning("Removing cluster " + removeID + ", merged with " + c2.getClusterID());
                Sharpen.Collections.Remove(document.corefClusters, removeID);
                return;
            }
        }
    }
}

/// <summary>
/// For mentions in the same sentence that share head string and start index, swaps them so that
/// the one with the shorter span text comes first.
/// </summary>
private static void MoveShorterSameHeadMentionsFirst(IList<Mention> l)
{
    // TODO: do we need this?
    // Sort mentions by length whenever we have two mentions beginning at the same position and having the same head
    for (int i = 0; i < l.Count; i++)
    {
        // Only pairs with j > i were ever swapped, so the scan starts right after i.
        for (int j = i + 1; j < l.Count; j++)
        {
            if (l[i].headString.Equals(l[j].headString)
                && l[i].startIndex == l[j].startIndex
                && l[i].SameSentence(l[j])
                && l[i].SpanToString().Length > l[j].SpanToString().Length)
            {
                Mention longer = l[i];
                l[i] = l[j];
                l[j] = longer;
            }
        }
    }
}
/// <summary>Checks if two clusters are coreferent according to our sieve pass constraints</summary>
/// <remarks>
/// The checks run in a fixed order and the first decisive one wins: hard filters, speaker and discourse
/// matching, incompatibility constraints, match checks, post-match vetoes, the coreference dictionary
/// and finally pronoun resolution. Besides returning a verdict, the method may record incompatible
/// pairs on <paramref name="document"/> and may copy <c>speakerInfo</c> between the representative
/// mention and <paramref name="ant"/>.
/// </remarks>
/// <param name="document">Document providing type, speaker and incompatibility information.</param>
/// <param name="mentionCluster">Cluster of the mention being resolved; its representative mention drives most rules.</param>
/// <param name="potentialAntecedent">Cluster of the candidate antecedent.</param>
/// <param name="mention2">The actual mention being resolved.</param>
/// <param name="ant">The candidate antecedent mention.</param>
/// <param name="dict">Dictionaries used by the rules.</param>
/// <param name="roleSet">Role mentions passed to the string/name match rules.</param>
/// <returns>True when the enabled rules accept the pair as coreferent.</returns>
/// <exception cref="System.Exception"/>
public virtual bool Coreferent(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention mention2, Mention ant, Dictionaries dict, ICollection<Mention> roleSet)
{
    bool ret = false;
    Mention mention = mentionCluster.GetRepresentativeMention();

    if (flags.UseIncompatibles)
    {
        // Check our list of incompatible mentions and don't cluster them together
        // Allows definite no's from previous sieves to propagate down
        if (document.IsIncompatible(mentionCluster, potentialAntecedent))
        {
            return false;
        }
    }
    if (flags.DoPronoun && Math.Abs(mention2.sentNum - ant.sentNum) > 3 && mention2.person != Dictionaries.Person.I && mention2.person != Dictionaries.Person.You)
    {
        return false;
    }
    if (mention2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(mention2.sentNum - ant.sentNum) > 3)
    {
        return false;
    }
    if (mention2.person == Dictionaries.Person.You && document.docType == Document.DocType.Article && mention2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"))
    {
        return false;
    }
    if (document.conllDoc != null)
    {
        if (ant.generic && ant.person == Dictionaries.Person.You)
        {
            return false;
        }
        if (mention2.generic)
        {
            return false;
        }
    }
    // chinese newswire contains coref nested NPs with shared headword Chen & Ng
    if (lang != Locale.Chinese || document.docInfo == null || !document.docInfo.GetOrDefault("DOC_ID", string.Empty).Contains("nw"))
    {
        if (mention2.InsideIn(ant) || ant.InsideIn(mention2))
        {
            return false;
        }
    }

    if (flags.UseSpeakermatch)
    {
        string mSpeaker = mention2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation));
        string aSpeaker = ant.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation));
        // <I> from same speaker
        if (mention2.person == Dictionaries.Person.I && ant.person == Dictionaries.Person.I)
        {
            return mSpeaker.Equals(aSpeaker);
        }
        // <I> - speaker
        // The speaker annotation is compared with the other mention's id rendered as text;
        // invariant formatting keeps that rendering independent of the current culture.
        if ((mention2.person == Dictionaries.Person.I && mSpeaker.Equals(ant.mentionID.ToString(System.Globalization.CultureInfo.InvariantCulture)))
            || (ant.person == Dictionaries.Person.I && aSpeaker.Equals(mention2.mentionID.ToString(System.Globalization.CultureInfo.InvariantCulture))))
        {
            return true;
        }
    }

    if (flags.UseDiscoursematch)
    {
        string mString = mention.LowercaseNormalizedSpanString();
        string antString = ant.LowercaseNormalizedSpanString();
        // mention and ant both belong to the same speaker cluster
        if (mention.speakerInfo != null && mention.speakerInfo == ant.speakerInfo)
        {
            return true;
        }
        // (I - I) in the same speaker's quotation.
        if (mention.number == Dictionaries.Number.Singular && dict.firstPersonPronouns.Contains(mString)
            && ant.number == Dictionaries.Number.Singular && dict.firstPersonPronouns.Contains(antString)
            && CorefRules.EntitySameSpeaker(document, mention, ant))
        {
            return true;
        }
        // (speaker - I)
        if ((mention.number == Dictionaries.Number.Singular && dict.firstPersonPronouns.Contains(mString))
            && CorefRules.AntecedentIsMentionSpeaker(document, mention, ant, dict))
        {
            if (mention.speakerInfo == null && ant.speakerInfo != null)
            {
                mention.speakerInfo = ant.speakerInfo;
            }
            return true;
        }
        // (I - speaker)
        if ((ant.number == Dictionaries.Number.Singular && dict.firstPersonPronouns.Contains(antString))
            && CorefRules.AntecedentIsMentionSpeaker(document, ant, mention, dict))
        {
            if (ant.speakerInfo == null && mention.speakerInfo != null)
            {
                ant.speakerInfo = mention.speakerInfo;
            }
            return true;
        }
        // Can be iffy if more than two speakers... but still should be okay most of the time
        if (dict.secondPersonPronouns.Contains(mString) && dict.secondPersonPronouns.Contains(antString) && CorefRules.EntitySameSpeaker(document, mention, ant))
        {
            return true;
        }
        // previous I - you or previous you - I in two person conversation
        if (((mention.person == Dictionaries.Person.I && ant.person == Dictionaries.Person.You) || (mention.person == Dictionaries.Person.You && ant.person == Dictionaries.Person.I))
            && (mention.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - ant.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1)
            && document.docType == Document.DocType.Conversation)
        {
            return true;
        }
        if (dict.reflexivePronouns.Contains(mention.headString) && CorefRules.EntitySubjectObject(mention, ant))
        {
            return true;
        }
    }

    if (!flags.UseExactstringmatch && !flags.UseRelaxedExactstringmatch && !flags.UseApposition && !flags.UseWordsInclusion)
    {
        foreach (Mention m in mentionCluster.GetCorefMentions())
        {
            foreach (Mention a in potentialAntecedent.GetCorefMentions())
            {
                // angelx - not sure about the logic here, disable (code was also refactored from original)
                // vv gabor - re-enabled code (seems to improve performance) vv
                if (m.person != Dictionaries.Person.I && a.person != Dictionaries.Person.I
                    && (CorefRules.AntecedentIsMentionSpeaker(document, m, a, dict) || CorefRules.AntecedentIsMentionSpeaker(document, a, m, dict)))
                {
                    document.AddIncompatible(m, a);
                    return false;
                }
                // ^^ end block of code in question ^^
                int dist = Math.Abs(m.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - a.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)));
                if (document.docType != Document.DocType.Article && dist == 1 && !CorefRules.EntitySameSpeaker(document, m, a))
                {
                    // The original looked up both utterance speakers in document.speakers here
                    // but never used them; the unused lookups have been removed.
                    if (m.person == Dictionaries.Person.I && a.person == Dictionaries.Person.I)
                    {
                        document.AddIncompatible(m, a);
                        return false;
                    }
                    if (m.person == Dictionaries.Person.You && a.person == Dictionaries.Person.You)
                    {
                        document.AddIncompatible(m, a);
                        return false;
                    }
                    // This is weak since we can refer to both speakers
                    if (m.person == Dictionaries.Person.We && a.person == Dictionaries.Person.We)
                    {
                        document.AddIncompatible(m, a);
                        return false;
                    }
                }
            }
        }
        if (document.docType == Document.DocType.Article)
        {
            foreach (Mention m_1 in mentionCluster.GetCorefMentions())
            {
                foreach (Mention a in potentialAntecedent.GetCorefMentions())
                {
                    if (CorefRules.EntitySubjectObject(m_1, a))
                    {
                        document.AddIncompatible(m_1, a);
                        return false;
                    }
                }
            }
        }
    }

    // Incompatibility constraints - do before match checks
    if (flags.USE_iwithini && CorefRules.EntityIWithinI(mention, ant, dict))
    {
        document.AddIncompatible(mention, ant);
        return false;
    }

    // Match checks
    if (flags.UseExactstringmatch && CorefRules.EntityExactStringMatch(mention, ant, dict, roleSet))
    {
        return true;
    }
    // if(flags.USE_EXACTSTRINGMATCH && Rules.entityExactStringMatch(mentionCluster, potentialAntecedent, dict, roleSet)){
    //   return true;
    // }
    if (flags.UseNameMatch && CheckEntityMatch(document, mentionCluster, potentialAntecedent, dict, roleSet))
    {
        ret = true;
    }
    if (flags.UseRelaxedExactstringmatch && CorefRules.EntityRelaxedExactStringMatch(mentionCluster, potentialAntecedent, mention, ant, dict, roleSet))
    {
        return true;
    }
    if (flags.UseApposition && CorefRules.EntityIsApposition(mentionCluster, potentialAntecedent, mention, ant))
    {
        return true;
    }
    if (flags.UsePredicatenominatives && CorefRules.EntityIsPredicateNominatives(mentionCluster, potentialAntecedent, mention, ant))
    {
        return true;
    }
    if (flags.UseAcronym && CorefRules.EntityIsAcronym(document, mentionCluster, potentialAntecedent))
    {
        return true;
    }
    if (flags.UseRelativepronoun && CorefRules.EntityIsRelativePronoun(mention, ant))
    {
        return true;
    }
    if (flags.UseDemonym && mention.IsDemonym(ant, dict))
    {
        return true;
    }
    if (flags.UseRoleapposition)
    {
        if (lang == Locale.Chinese)
        {
            ret = false;
        }
        else if (CorefRules.EntityIsRoleAppositive(mentionCluster, potentialAntecedent, mention, ant, dict))
        {
            ret = true;
        }
    }
    if (flags.UseInclusionHeadmatch && CorefRules.EntityHeadsAgree(mentionCluster, potentialAntecedent, mention, ant, dict))
    {
        ret = true;
    }
    if (flags.UseRelaxedHeadmatch && CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(mentionCluster, potentialAntecedent, mention, ant))
    {
        ret = true;
    }

    // Vetoes applied to a tentative match (ret == true) or unconditionally.
    if (flags.UseWordsInclusion && ret && !CorefRules.EntityWordsIncluded(mentionCluster, potentialAntecedent, mention, ant))
    {
        return false;
    }
    if (flags.UseIncompatibleModifier && ret && CorefRules.EntityHaveIncompatibleModifier(mentionCluster, potentialAntecedent))
    {
        return false;
    }
    if (flags.UseProperheadAtLast && ret && !CorefRules.EntitySameProperHeadLastWord(mentionCluster, potentialAntecedent, mention, ant))
    {
        return false;
    }
    if (flags.UseAttributesAgree && !CorefRules.EntityAttributesAgree(mentionCluster, potentialAntecedent))
    {
        return false;
    }
    if (flags.UseDifferentLocation && CorefRules.EntityHaveDifferentLocation(mention, ant, dict))
    {
        return false;
    }
    if (flags.UseNumberInMention && CorefRules.EntityNumberInLaterMention(mention, ant))
    {
        return false;
    }
    if (flags.UseDistance && CorefRules.EntityTokenDistance(mention2, ant))
    {
        return false;
    }

    if (flags.UseCorefDict)
    {
        // Head match
        if (ant.headWord.Lemma().Equals(mention2.headWord.Lemma()))
        {
            return false;
        }
        // Constraint: ignore pairs commonNoun - properNoun
        if (ant.mentionType != Dictionaries.MentionType.Proper)
        {
            // Short-circuit order preserved: the POS tag is tested before the head word is inspected.
            if (mention2.headWord.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)).StartsWith("NNP", System.StringComparison.Ordinal))
            {
                return false;
            }
            // Any upper-case character after the first one makes the tail differ from its lower-cased form.
            string headTail = Sharpen.Runtime.Substring(mention2.headWord.Word(), 1);
            if (!headTail.Equals(headTail.ToLower()))
            {
                return false;
            }
        }
        // Constraint: ignore plurals
        if (ant.headWord.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)).Equals("NNS") && mention2.headWord.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)).Equals("NNS"))
        {
            return false;
        }
        // Constraint: ignore mentions with indefinite determiners
        if (dict.indefinitePronouns.Contains(ant.originalSpan[0].Lemma()) || dict.indefinitePronouns.Contains(mention2.originalSpan[0].Lemma()))
        {
            return false;
        }
        // Constraint: ignore coordinated mentions
        if (ant.IsCoordinated() || mention2.IsCoordinated())
        {
            return false;
        }
        // Constraint: context incompatibility
        if (CorefRules.ContextIncompatible(mention2, ant, dict))
        {
            return false;
        }
        // Constraint: sentence context incompatibility when the mentions are common nouns
        if (CorefRules.SentenceContextIncompatible(mention2, ant, dict))
        {
            return false;
        }
        if (CorefRules.EntityClusterAllCorefDictionary(mentionCluster, potentialAntecedent, dict, 1, 8))
        {
            return true;
        }
        if (CorefRules.EntityCorefDictionary(mention, ant, dict, 2, 2))
        {
            return true;
        }
        if (CorefRules.EntityCorefDictionary(mention, ant, dict, 3, 2))
        {
            return true;
        }
        if (CorefRules.EntityCorefDictionary(mention, ant, dict, 4, 2))
        {
            return true;
        }
    }

    if (flags.DoPronoun)
    {
        // Use the actual mention when it is a predicate nominative of the representative one.
        Mention m;
        if (mention.predicateNominatives != null && mention.predicateNominatives.Contains(mention2))
        {
            m = mention2;
        }
        else
        {
            m = mention;
        }
        bool mIsPronoun = (m.IsPronominal() || dict.allPronouns.Contains(m.ToString()));
        bool attrAgree = HybridCorefProperties.UseDefaultPronounAgreement(props)
            ? CorefRules.EntityAttributesAgree(mentionCluster, potentialAntecedent)
            : CorefRules.EntityAttributesAgree(mentionCluster, potentialAntecedent, lang);
        if (mIsPronoun && attrAgree)
        {
            if (dict.demonymSet.Contains(ant.LowercaseNormalizedSpanString()) && dict.notOrganizationPRP.Contains(m.headString))
            {
                document.AddIncompatible(m, ant);
                return false;
            }
            if (CorefRules.EntityPersonDisagree(document, mentionCluster, potentialAntecedent, dict))
            {
                document.AddIncompatible(m, ant);
                return false;
            }
            return true;
        }
    }

    if (flags.UseChineseHeadMatch)
    {
        if (mention2.headWord == ant.headWord && mention2.InsideIn(ant))
        {
            // NOTE(review): the original evaluated document.IsCoref(mention2, ant) here and discarded
            // the result; the call was dropped on the assumption that it is a pure query - confirm.
            // TODO: exclude conjunction
            // log.info("error in chinese head match: "+mention2.spanToString()+"\t"+ant.spanToString());
            return true;
        }
    }

    return ret;
}
/// <summary>
/// Builds the dictionaries from <paramref name="props"/> by forwarding each configured resource
/// location (with the <c>DefaultPaths</c> fallbacks given below) to the explicit-argument constructor,
/// then loads the Chinese gender/number/animacy dictionary when <c>coref.zh.dict</c> is present.
/// </summary>
/// <param name="props">Configuration holding the resource locations.</param>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.IO.IOException"/>
public Dictionaries(Properties props)
    : this(
        props.GetProperty(HybridCorefProperties.LangProp, HybridCorefProperties.LanguageDefault.ToLanguageTag()),
        props.GetProperty(HybridCorefProperties.DemonymProp, DefaultPaths.DefaultDcorefDemonym),
        props.GetProperty(HybridCorefProperties.AnimateProp, DefaultPaths.DefaultDcorefAnimate),
        props.GetProperty(HybridCorefProperties.InanimateProp, DefaultPaths.DefaultDcorefInanimate),
        props.GetProperty(HybridCorefProperties.MaleProp),
        props.GetProperty(HybridCorefProperties.NeutralProp),
        props.GetProperty(HybridCorefProperties.FemaleProp),
        props.GetProperty(HybridCorefProperties.PluralProp),
        props.GetProperty(HybridCorefProperties.SingularProp),
        props.GetProperty(HybridCorefProperties.StatesProp, DefaultPaths.DefaultDcorefStates),
        props.GetProperty(HybridCorefProperties.GenderNumberProp, HybridCorefProperties.GetGenderNumber(props)),
        props.GetProperty(HybridCorefProperties.CountriesProp, DefaultPaths.DefaultDcorefCountries),
        props.GetProperty(HybridCorefProperties.StatesProvincesProp, DefaultPaths.DefaultDcorefStatesAndProvinces),
        HybridCorefProperties.GetSieves(props).Contains("CorefDictionaryMatch"),
        PropertiesUtils.GetStringArray(
            props,
            HybridCorefProperties.DictListProp,
            new string[]
            {
                DefaultPaths.DefaultDcorefDict1,
                DefaultPaths.DefaultDcorefDict2,
                DefaultPaths.DefaultDcorefDict3,
                DefaultPaths.DefaultDcorefDict4
            }),
        props.GetProperty(HybridCorefProperties.DictPmiProp, DefaultPaths.DefaultDcorefDict1),
        props.GetProperty(HybridCorefProperties.SignaturesProp, DefaultPaths.DefaultDcorefNeSignatures))
{
    // Disabled in the original source:
    // if(Boolean.parseBoolean(props.getProperty("useValDictionary"))) {
    //   log.info("LOAD: ValDictionary");
    //   for(String line : IOUtils.readLines(valDict)) {
    //     String[] split = line.toLowerCase().split("\t");
    //     strToEntity.put(split[0], split[2]);
    //     dictScore.setCount(split[0], Double.parseDouble(split[1]));
    //   }
    // }
    /*if(CorefProperties.useSemantics(props)) {
     *   loadSemantics(props);
     * } else {
     *   log.info("SEMANTICS NOT LOADED");
     * }*/

    const string chineseDictKey = "coref.zh.dict";
    if (!props.Contains(chineseDictKey))
    {
        return;
    }
    LoadChineseGenderNumberAnimacy(props.GetProperty(chineseDictKey));
}