/// <summary>
/// Converts the coreference chains of a CoNLL document into gold <c>Mention</c> objects,
/// grouped by the sentence each mention occurs in.
/// </summary>
/// <remarks>
/// The first mention seen in a chain takes the chain id as its mention id and gets
/// <c>originalRef = -1</c>. Every further mention of that chain receives a fresh id, handed out
/// sequentially starting just above the largest chain id, and records the chain id in
/// <c>originalRef</c>. The <c>maxID</c> member is raised to the largest mention id assigned here.
/// </remarks>
/// <param name="conllDoc">Document supplying the sentence annotations and the coref chain map.</param>
/// <returns>
/// One list per sentence (same order as the document's sentences) holding the gold mentions
/// found in that sentence; sentences without mentions get an empty list.
/// </returns>
public virtual IList<IList<Mention>> ExtractGoldMentions(CoNLL2011DocumentReader.Document conllDoc)
{
    IList<ICoreMap> sentences = conllDoc.GetAnnotation().Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<IList<Mention>> mentionsBySentence = new List<IList<Mention>>();
    CollectionValuedMap<string, ICoreMap> chainsById = conllDoc.GetCorefChainMap();

    // Pre-create one bucket per sentence so mentions can be filed by sentence index below.
    while (mentionsBySentence.Count < sentences.Count)
    {
        mentionsBySentence.Add(new List<Mention>());
    }

    // Chain ids are stored as strings; find the largest so that fresh mention ids never
    // collide with an existing chain id.
    int largestClusterId = -1;
    foreach (string clusterKey in chainsById.Keys)
    {
        int parsedKey = System.Convert.ToInt32(clusterKey);
        largestClusterId = parsedKey > largestClusterId ? parsedKey : largestClusterId;
    }
    int nextFreeMentionId = largestClusterId + 1;

    foreach (KeyValuePair<string, ICollection<ICoreMap>> chain in chainsById)
    {
        int clusterId = System.Convert.ToInt32(chain.Key);
        bool isFirstInCluster = true;

        foreach (ICoreMap span in chain.Value)
        {
            Mention mention = new Mention();
            mention.goldCorefClusterID = clusterId;

            if (isFirstInCluster)
            {
                // The first mention of a chain reuses the chain id and has no original reference.
                mention.mentionID = clusterId;
                mention.originalRef = -1;
                isFirstInCluster = false;
            }
            else
            {
                // Later mentions get the next unused id and point back at the chain id.
                mention.mentionID = nextFreeMentionId;
                mention.originalRef = clusterId;
                nextFreeMentionId++;
            }

            // Keep the running maximum of all mention ids handed out so far.
            if (mention.mentionID > maxID)
            {
                maxID = mention.mentionID;
            }

            int sentenceIndex = span.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
            ICoreMap sentence = sentences[sentenceIndex];

            // Both offsets are made relative to the first token of the containing sentence.
            mention.startIndex = span.Get(typeof(CoreAnnotations.TokenBeginAnnotation))
                                 - sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
            mention.endIndex = span.Get(typeof(CoreAnnotations.TokenEndAnnotation))
                               - sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));

            // will be set by arrange
            mention.originalSpan = span.Get(typeof(CoreAnnotations.TokensAnnotation));

            // The mention's dependency graph is the enhanced dependency graph of its sentence.
            mention.dependency = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));

            mentionsBySentence[sentenceIndex].Add(mention);
        }
    }

    return mentionsBySentence;
}
/// <summary>
/// Reads the next CoNLL document from <c>reader</c> and builds the corresponding coreference
/// <c>Document</c> with both gold and predicted mentions.
/// </summary>
/// <remarks>
/// Steps, in order: prepare per-sentence parse trees and dependency graphs, assign speaker and
/// utterance annotations to every token, run <c>stanfordProcessor</c> over the annotation,
/// extract gold and predicted mentions, report recall errors, and arrange everything into a
/// <c>Document</c>.
/// </remarks>
/// <returns>
/// The arranged document with its <c>conllDoc</c> member set, or <c>null</c> when
/// <c>reader.GetNextDocument()</c> returns <c>null</c>.
/// </returns>
/// <exception cref="System.Exception">
/// Thrown when <c>RecallErrors</c> fails with an <c>IOException</c>; the original exception is
/// preserved as the inner exception.
/// </exception>
public override Document NextDoc()
{
    CoNLL2011DocumentReader.Document conllDoc = reader.GetNextDocument();
    if (conllDoc == null)
    {
        return null;
    }

    Annotation anno = conllDoc.GetAnnotation();
    IList<ICoreMap> sentences = anno.Get(typeof(CoreAnnotations.SentencesAnnotation));

    foreach (ICoreMap sentence in sentences)
    {
        if (!Constants.UseGoldParses && !replicateCoNLL)
        {
            // Drop the gold tree from the annotation; a parse is expected to be supplied
            // later (presumably by the pipeline run below -- confirm against its configuration).
            sentence.Remove(typeof(TreeCoreAnnotations.TreeAnnotation));
        }
        else
        {
            Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
            treeLemmatizer.TransformTree(tree);

            // Generate the dependency graphs from the tree. Failures here are deliberately
            // non-fatal: the sentence simply ends up without dependency annotations.
            try
            {
                SemanticGraph deps = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Enhanced, GrammaticalStructure.Extras.None);
                SemanticGraph basicDeps = SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Basic, GrammaticalStructure.Extras.None);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), basicDeps);
                sentence.Set(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation), deps);
            }
            catch (Exception e)
            {
                logger.Log(Level.Warning, "Exception caught during extraction of Stanford dependencies. Will ignore and continue...", e);
            }
        }
    }

    // Number utterances: the counter advances every time the speaker changes between
    // consecutive tokens. Tokens without a speaker annotation get the empty string as speaker.
    string preSpeaker = null;
    int utterance = -1;
    foreach (CoreLabel token in anno.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        if (!token.ContainsKey(typeof(CoreAnnotations.SpeakerAnnotation)))
        {
            token.Set(typeof(CoreAnnotations.SpeakerAnnotation), string.Empty);
        }

        string curSpeaker = token.Get(typeof(CoreAnnotations.SpeakerAnnotation));
        if (!curSpeaker.Equals(preSpeaker))
        {
            utterance++;
            preSpeaker = curSpeaker;
        }
        token.Set(typeof(CoreAnnotations.UtteranceAnnotation), utterance);
    }

    // Run pipeline
    stanfordProcessor.Annotate(anno);

    // Collect tokens and trees per sentence after the pipeline has run.
    IList<IList<CoreLabel>> allWords = new List<IList<CoreLabel>>();
    IList<Tree> allTrees = new List<Tree>();
    foreach (ICoreMap annotatedSentence in anno.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        allWords.Add(annotatedSentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        allTrees.Add(annotatedSentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation)));
    }

    // Initialize gold mentions. This must happen before predicted mentions are extracted:
    // ExtractGoldMentions raises maxID, which is passed to the mention finder below.
    IList<IList<Mention>> allGoldMentions = ExtractGoldMentions(conllDoc);

    // Predicted mentions come from the mention finder rather than aliasing the gold lists,
    // since mentions may later be merged or have their mention ids changed.
    IList<IList<Mention>> allPredictedMentions = mentionFinder.ExtractPredictedMentions(anno, maxID, dictionaries);

    try
    {
        RecallErrors(allGoldMentions, allPredictedMentions, anno);
    }
    catch (IOException e)
    {
        // System.Exception has no constructor taking only an exception; pass the message
        // and keep the original as the inner exception so the cause is not lost.
        throw new Exception(e.Message, e);
    }

    Document doc = Arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
    doc.conllDoc = conllDoc;
    return doc;
}