/// <summary>
/// Converts an ACE entity mention and attaches the given coreference chain id
/// to the result.
/// </summary>
/// <param name="entityMention">The ACE entity mention to convert.</param>
/// <param name="docId">ID of the document containing this entity mention.</param>
/// <param name="sentence">The sentence containing the mention.</param>
/// <param name="tokenOffset">Token offset of the sentence start within the document.</param>
/// <param name="corefID">Coreference chain id to set on the converted mention.</param>
/// <returns>The converted mention with its coref id populated.</returns>
private EntityMention ConvertAceEntityMention(AceEntityMention entityMention, string docId, ICoreMap sentence, int tokenOffset, string corefID)
{
    // Delegate to the base conversion, then attach the coreference chain id.
    EntityMention mention = ConvertAceEntityMention(entityMention, docId, sentence, tokenOffset);
    mention.SetCorefID(corefID);
    return mention;
}
/// <summary>
/// Converts NamedEntityTagAnnotation tags into
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>s.
/// This finds the longest sequence of NamedEntityTagAnnotation tags of the matching type.
/// </summary>
/// <param name="sentence">A sentence, ideally annotated with NamedEntityTagAnnotation</param>
/// <param name="nerTag">The name of the NER tag to copy, e.g. "DATE".</param>
/// <param name="entityType">
/// The type of the
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// objects created
/// </param>
public virtual void MakeAnnotationFromGivenNERTag(ICoreMap sentence, string nerTag, string entityType)
{
    IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
    System.Diagnostics.Debug.Assert(words != null);
    System.Diagnostics.Debug.Assert(mentions != null);
    for (int start = 0; start < words.Count; start++)
    {
        int end;
        // find the first token at or after start that isn't tagged with nerTag
        for (end = start; end < words.Count; end++)
        {
            string ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            // BUG FIX: compare via nerTag.Equals(ne) so a token with no NER tag
            // (ne == null) terminates the run instead of throwing a
            // NullReferenceException, as the old ne.Equals(nerTag) did.
            if (!nerTag.Equals(ne))
            {
                break;
            }
        }
        if (end > start)
        {
            // found a maximal run [start, end) of tokens tagged nerTag
            EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
            logger.Info("Created " + entityType + " entity mention: " + m);
            // resume scanning right after this run (loop's start++ lands on end)
            start = end - 1;
            mentions.Add(m);
        }
    }
    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
}
/// <summary>
/// Serializes an entity mention to an XML element:
/// an "entity" element carrying the object id, with child elements for
/// type, optional normalized name, optional subtype, head span, and
/// type probabilities.
/// </summary>
/// <param name="entity">The entity mention to serialize.</param>
/// <param name="curNS">XML namespace to create the elements in.</param>
/// <returns>The populated "entity" element.</returns>
private static Element ToXML(EntityMention entity, string curNS)
{
    Element top = new Element("entity", curNS);
    top.AddAttribute(new Attribute("id", entity.GetObjectId()));
    Element type = new Element("type", curNS);
    type.AppendChild(entity.GetType());
    // BUG FIX: the populated <type> element was discarded and the raw type
    // string was appended to <entity> directly; append the element itself.
    top.AppendChild(type);
    if (entity.GetNormalizedName() != null)
    {
        Element nm = new Element("normalized", curNS);
        nm.AppendChild(entity.GetNormalizedName());
        top.AppendChild(nm);
    }
    if (entity.GetSubType() != null)
    {
        Element subtype = new Element("subtype", curNS);
        subtype.AppendChild(entity.GetSubType());
        top.AppendChild(subtype);
    }
    Element span = new Element("span", curNS);
    // BUG FIX: int.ToString(x) is an invalid Integer.toString(x) mistranslation;
    // call ToString() on the value instead.
    span.AddAttribute(new Attribute("start", entity.GetHeadTokenStart().ToString()));
    span.AddAttribute(new Attribute("end", entity.GetHeadTokenEnd().ToString()));
    top.AppendChild(span);
    top.AppendChild(MakeProbabilitiesElement(entity, curNS));
    return top;
}
/// <summary>
/// Builds an <c>EntityMention</c> over tokens [start, end) of the sentence,
/// deriving the entity type from the label (a BIO prefix "B-"/"I-" is stripped)
/// and marking that type with probability 1.0.
/// </summary>
/// <param name="sentence">The sentence containing the mention.</param>
/// <param name="start">First token index of the mention (inclusive).</param>
/// <param name="end">Last token index of the mention (exclusive).</param>
/// <param name="label">Entity label, optionally BIO-prefixed.</param>
/// <param name="identifier">Unique id for the new mention.</param>
/// <returns>The constructed mention.</returns>
public virtual EntityMention MakeEntityMention(ICoreMap sentence, int start, int end, string label, string identifier)
{
    // Strip a BIO prefix from the label, if present, to obtain the type.
    // TODO: add support for subtypes! (needed at least in ACE)
    bool hasBioPrefix = label.StartsWith("B-") || label.StartsWith("I-");
    string type = hasBioPrefix ? Sharpen.Runtime.Substring(label, 2) : label;
    string subtype = null;
    Span span = new Span(start, end);
    EntityMention entity = entityMentionFactory.ConstructEntityMention(identifier, sentence, span, span, type, subtype, null);
    // Record the assigned type as certain.
    ICounter<string> probs = new ClassicCounter<string>();
    probs.SetCount(entity.GetType(), 1.0);
    entity.SetTypeProbabilities(probs);
    return entity;
}
/// <summary>
/// Builds an <c>EntityMention</c> over tokens [start, end) and appends it to
/// the supplied list, assigning it a freshly generated identifier.
/// </summary>
/// <param name="sentence">The sentence containing the mention.</param>
/// <param name="start">First token index (inclusive); must be non-negative.</param>
/// <param name="end">Last token index (exclusive).</param>
/// <param name="label">Entity label, optionally BIO-prefixed.</param>
/// <param name="entities">Accumulator list; the new mention is appended.</param>
/// <param name="sentCount">Sentence index used when composing the identifier.</param>
public virtual void MakeEntityMention(ICoreMap sentence, int start, int end, string label, IList<EntityMention> entities, int sentCount)
{
    System.Diagnostics.Debug.Assert((start >= 0));
    // Identifier encodes the sentence index and the mention's position in the list.
    string id = MakeEntityMentionIdentifier(sentence, sentCount, entities.Count);
    entities.Add(MakeEntityMention(sentence, start, end, label, id));
}
/// <summary>
/// Builds the classifier label for a mention: the coarse type, suffixed with
/// "-subtype" when subtypes are enabled and the mention has one.
/// </summary>
/// <param name="m">The entity mention to label.</param>
/// <returns>"type" or "type-subtype".</returns>
private string MakeLabel(EntityMention m)
{
    return (useSubTypes && m.GetSubType() != null)
        ? m.GetType() + "-" + m.GetSubType()
        : m.GetType();
}
/// <summary>
/// Convert an
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceEntityMention"/>
/// to an
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// .
/// </summary>
/// <param name="entityMention">
///
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceEntityMention"/>
/// to convert
/// </param>
/// <param name="docId">ID of the document containing this entity mention</param>
/// <param name="sentence">The sentence the mention belongs to; its token list bounds the converted spans.</param>
/// <param name="tokenOffset">
/// An offset in the calculations of position of the extent to sentence boundary
/// (the ace.reader stores absolute token offset from the beginning of the document, but
/// we need token offsets from the beginning of the sentence => adjust by tokenOffset)
/// </param>
/// <returns>
/// entity as an
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// </returns>
private EntityMention ConvertAceEntityMention(AceEntityMention entityMention, string docId, ICoreMap sentence, int tokenOffset)
{
    //log.info("TYPE is " + entityMention.getParent().getType());
    //log.info("SUBTYPE is " + entityMention.getParent().getSubtype());
    //log.info("LDCTYPE is " + entityMention.getLdctype());
    AceCharSeq ext = entityMention.GetExtent();
    AceCharSeq head = entityMention.GetHead();
    // Convert document-absolute, end-inclusive ACE offsets into
    // sentence-relative, end-exclusive spans (hence -tokenOffset and +1).
    int extStart = ext.GetTokenStart() - tokenOffset;
    int extEnd = ext.GetTokenEnd() - tokenOffset + 1;
    // Clamp the extent to the sentence boundaries (bad EOS detection upstream
    // can push the span outside the sentence).
    if (extStart < 0)
    {
        logger.Severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
        logger.Severe("This may happen due to incorrect EOS detection. 
Adjusting entity extent.");
        extStart = 0;
    }
    if (extEnd > sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Count)
    {
        logger.Severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
        logger.Severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
        extEnd = sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Count;
    }
    // Same conversion and clamping for the head span.
    int headStart = head.GetTokenStart() - tokenOffset;
    int headEnd = head.GetTokenEnd() - tokenOffset + 1;
    if (headStart < 0)
    {
        logger.Severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
        logger.Severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
        headStart = 0;
    }
    if (headEnd > sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Count)
    {
        logger.Severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
        logger.Severe("This may happen due to incorrect EOS detection. 
Adjusting entity head span.");
        headEnd = sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Count;
    }
    // must adjust due to possible incorrect EOS detection:
    // the head span must stay inside the (possibly clamped) extent
    if (headStart < extStart)
    {
        headStart = extStart;
    }
    if (headEnd > extEnd)
    {
        headEnd = extEnd;
    }
    System.Diagnostics.Debug.Assert((headStart < headEnd));
    // note: the ace.reader stores absolute token offset from the beginning of the document, but
    // we need token offsets from the beginning of the sentence => adjust by tokenOffset
    // note: in ace.reader the end token position is inclusive, but
    // in our setup the end token position is exclusive => add 1 to end
    EntityMention converted = new EntityMention(entityMention.GetId(), sentence, new Span(extStart, extEnd), new Span(headStart, headEnd), entityMention.GetParent().GetType(), entityMention.GetParent().GetSubtype(), entityMention.GetLdctype());
    return (converted);
}
/*
 * Sets the head word and the index for an entity, given the parse tree for
 * the sentence containing the entity.
 *
 * This code is no longer used, but I've kept it around (at least for now) as
 * reference when we modify preProcessSentences().
 */
private void SetHeadWord(EntityMention entity, Tree tree)
{
    IList<Tree> leaves = tree.GetLeaves();
    // Find the smallest subtree covering the entity extent and take its head terminal.
    // NOTE(review): GetExtentTokenEnd() is used here as an INCLUSIVE leaf index.
    Tree argRoot = tree.JoinNode(leaves[entity.GetExtentTokenStart()], leaves[entity.GetExtentTokenEnd()]);
    Tree headWordNode = argRoot.HeadTerminal(headFinder);
    int headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);
    // If the extent ends in punctuation and the head fell outside the extent,
    // retry with the trailing punctuation token excluded.
    if (StringUtils.IsPunct(leaves[entity.GetExtentTokenEnd()].Label().Value().Trim()) && (headWordIndex >= entity.GetExtentTokenEnd() || headWordIndex < entity.GetExtentTokenStart()))
    {
        argRoot = tree.JoinNode(leaves[entity.GetExtentTokenStart()], leaves[entity.GetExtentTokenEnd() - 1]);
        headWordNode = argRoot.HeadTerminal(headFinder);
        headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);
        if (headWordIndex >= entity.GetExtentTokenStart() && headWordIndex <= entity.GetExtentTokenEnd() - 1)
        {
            entity.SetHeadTokenPosition(headWordIndex);
            entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
        }
    }
    // Deliberate fall-through: re-check with the (possibly updated) index;
    // a redundant second assignment is harmless.
    if (headWordIndex >= entity.GetExtentTokenStart() && headWordIndex <= entity.GetExtentTokenEnd())
    {
        entity.SetHeadTokenPosition(headWordIndex);
        entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    }
    else
    {
        // Re-parse the argument words by themselves.
        // Get the list of words in the arg by looking at the leaves between
        // arg.getExtentTokenStart() and arg.getExtentTokenEnd() inclusive
        IList<string> argWords = new List<string>();
        for (int i = entity.GetExtentTokenStart(); i <= entity.GetExtentTokenEnd(); i++)
        {
            argWords.Add(leaves[i].Label().Value());
        }
        if (StringUtils.IsPunct(argWords[argWords.Count - 1]))
        {
            // BUG FIX: Remove(argWords.Count - 1) is a mistranslation of Java's
            // List.remove(int index) — on IList<string> it would try to remove a
            // *value*, not the last element. RemoveAt drops the trailing
            // punctuation token by index, as intended.
            argWords.RemoveAt(argWords.Count - 1);
        }
        Tree argTree = ParseStrings(argWords);
        headWordNode = argTree.HeadTerminal(headFinder);
        // Map the local-parse head index back into sentence coordinates.
        headWordIndex = GetIndexByObjectEquality(argTree.GetLeaves(), headWordNode) + entity.GetExtentTokenStart();
        entity.SetHeadTokenPosition(headWordIndex);
        entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
    }
}
/// <summary>Find the index of the head of an entity.</summary>
/// <param name="ent">The entity mention</param>
/// <param name="tree">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <param name="setHeadSpan">Whether to set the head span in the entity mention.</param>
/// <returns>The index of the entity head</returns>
public virtual int AssignSyntacticHead(EntityMention ent, Tree tree, IList<CoreLabel> tokens, bool setHeadSpan)
{
    // Reuse a previously assigned head if one exists (-1 means unassigned).
    if (ent.GetSyntacticHeadTokenPosition() != -1)
    {
        return (ent.GetSyntacticHeadTokenPosition());
    }
    logger.Finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.ToString());
    logger.Finest("Flat sentence is: " + tokens);
    Tree sh = null;
    try
    {
        sh = FindSyntacticHead(ent, tree, tokens);
    }
    catch (Exception e)
    {
        // Parse failures are survivable: fall back to the right-most-token heuristic below.
        logger.Severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + SentenceToString(tokens));
        Sharpen.Runtime.PrintStackTrace(e);
    }
    // Default head: the last token of the extent (right-most head heuristic).
    int headPos = ent.GetExtentTokenEnd() - 1;
    if (sh != null)
    {
        // Use the begin index of the syntactic head leaf found in the tree.
        CoreLabel label = (CoreLabel)sh.Label();
        headPos = label.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    }
    else
    {
        logger.Fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
        logger.Fine("Fallback strategy: will set head to last token in mention: " + tokens[headPos]);
    }
    ent.SetHeadTokenPosition(headPos);
    if (setHeadSpan)
    {
        // set the head span to match exactly the syntactic head
        // this is needed for some corpora where the head span is not given
        ent.SetHeadTokenSpan(new Span(headPos, headPos + 1));
    }
    return (headPos);
}
/// <summary>
/// This is the original version of
/// <see cref="FindSyntacticHead(Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention, Edu.Stanford.Nlp.Trees.Tree, System.Collections.Generic.IList{E})"/>
/// before Chris's modifications.
/// There's no good reason to use it except for producing historical results.
/// It Finds the syntactic head of the given entity mention.
/// </summary>
/// <param name="ent">The entity mention</param>
/// <param name="root">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <returns>
/// The tree object corresponding to the head. This MUST be a child of root.
/// It will be a leaf in the parse tree.
/// </returns>
public virtual Tree OriginalFindSyntacticHead(EntityMention ent, Tree root, IList<CoreLabel> tokens)
{
    logger.Fine("Searching for tree matching " + ent);
    Tree exactMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());
    //
    // found an exact match
    //
    if (exactMatch != null)
    {
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(exactMatch));
        return (SafeHead(exactMatch));
    }
    //
    // no exact match found
    // in this case, we parse the actual extent of the mention
    //
    IList<CoreLabel> extentTokens = new List<CoreLabel>();
    for (int i = ent.GetExtentTokenStart(); i < ent.GetExtentTokenEnd(); i++)
    {
        extentTokens.Add(tokens[i]);
    }
    Tree tree = Parse(extentTokens);
    logger.Fine("No exact match found. Local parse:\n" + tree.PennString());
    ConvertToCoreLabels(tree);
    // re-index the local parse so its leaf spans line up with sentence positions
    tree.IndexSpans(ent.GetExtentTokenStart());
    Tree extentHead = SafeHead(tree);
    System.Diagnostics.Debug.Assert((extentHead != null));
    // extentHead is a child in the local extent parse tree. We need to find the
    // corresponding node in the main tree
    CoreLabel l = (CoreLabel)extentHead.Label();
    Tree realHead = FindTreeWithSpan(root, l.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), l.Get(typeof(CoreAnnotations.EndIndexAnnotation)));
    System.Diagnostics.Debug.Assert((realHead != null));
    return (realHead);
}
/// <summary>
/// Converts NamedEntityTagAnnotation tags into
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>s.
/// This finds the longest sequence of NamedEntityTagAnnotation tags of the matching type.
/// </summary>
/// <param name="sentence">A sentence annotated with NamedEntityTagAnnotation</param>
public virtual void MakeAnnotationFromAllNERTags(ICoreMap sentence)
{
    IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
    System.Diagnostics.Debug.Assert(words != null);
    if (mentions == null)
    {
        this.logger.Info("mentions are null");
        mentions = new List<EntityMention>();
    }
    for (int start = 0; start < words.Count; start++)
    {
        int end;
        // find the first token after start whose NER tag differs from the running one
        string lastneTag = null;
        string ne = null;
        for (end = start; end < words.Count; end++)
        {
            ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            // BUG FIX: guard against tokens with no NER tag (ne == null), which
            // previously threw a NullReferenceException; an untagged token ends
            // the run just like the background symbol does.
            if (ne == null || ne.Equals(SeqClassifierFlags.DefaultBackgroundSymbol) || (lastneTag != null && !ne.Equals(lastneTag)))
            {
                break;
            }
            lastneTag = ne;
        }
        if (end > start)
        {
            // found a maximal run [start, end) sharing the same NER tag
            string entityType = this.GetEntityTypeForTag(lastneTag);
            EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
            //TODO: changed entityType in the above sentence to nerTag - Sonal
            logger.Info("Created " + entityType + " entity mention: " + m);
            // resume scanning right after this run (loop's start++ lands on end)
            start = end - 1;
            mentions.Add(m);
        }
    }
    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
}
/// <summary>
/// Reads in a single ACE*.apf.xml file and convert it to RelationSentence
/// objects.
/// </summary>
/// <remarks>
/// Reads in a single ACE*.apf.xml file and convert it to RelationSentence
/// objects. However, you probably should call parse() instead.
/// </remarks>
/// <param name="prefix">
/// prefix of ACE filename to read (e.g.
/// "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"
/// ) (no ".apf.xml" extension)
/// </param>
/// <param name="corpus">The corpus annotation; not read by this method — TODO confirm whether callers rely on it.</param>
/// <returns>list of RelationSentence objects</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
private IList<ICoreMap> ReadDocument(string prefix, Annotation corpus)
{
    logger.Info("Reading document: " + prefix);
    IList<ICoreMap> results = new List<ICoreMap>();
    AceDocument aceDocument;
    // ACE2004 needs the version passed through; later versions use the default parse.
    if (aceVersion.Equals("ACE2004"))
    {
        aceDocument = AceDocument.ParseDocument(prefix, false, aceVersion);
    }
    else
    {
        aceDocument = AceDocument.ParseDocument(prefix, false);
    }
    string docId = aceDocument.GetId();
    // map entity mention ID strings to their EntityMention counterparts
    IDictionary<string, EntityMention> entityMentionMap = Generics.NewHashMap();
    /*
     * for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
     * List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
     * StringBuffer b = new StringBuffer();
     * for(AceToken t: tokens) b.append(t.getLiteral() + " " );
     * logger.info("SENTENCE: " + b.toString());
     * }
     */
    // tokenOffset tracks the absolute token position of the current sentence start,
    // used to convert document-absolute ACE offsets to sentence-relative ones.
    int tokenOffset = 0;
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.GetSentenceCount(); sentenceIndex++)
    {
        IList<AceToken> tokens = aceDocument.GetSentence(sentenceIndex);
        IList<CoreLabel> words = new List<CoreLabel>();
        StringBuilder textContent = new StringBuilder();
        // Build CoreLabels (with character offsets) and the space-joined sentence text.
        for (int i = 0; i < tokens.Count; i++)
        {
            CoreLabel l = new CoreLabel();
            l.SetWord(tokens[i].GetLiteral());
            l.Set(typeof(CoreAnnotations.ValueAnnotation), l.Word());
            l.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), tokens[i].GetByteStart());
            l.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), tokens[i].GetByteEnd());
            words.Add(l);
            if (i > 0)
            {
                textContent.Append(" ");
            }
            textContent.Append(tokens[i].GetLiteral());
        }
        // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
        if (words.Count == 1)
        {
            string word = words[0].Word();
            if (word.StartsWith("<") && word.EndsWith(">"))
            {
                // still advance the offset so later sentences line up
                tokenOffset += tokens.Count;
                continue;
            }
        }
        ICoreMap sentence = new Annotation(textContent.ToString());
        sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
        logger.Info("Reading sentence: \"" + textContent + "\"");
        IList<AceEntityMention> entityMentions = aceDocument.GetEntityMentions(sentenceIndex);
        IList<AceRelationMention> relationMentions = aceDocument.GetRelationMentions(sentenceIndex);
        IList<AceEventMention> eventMentions = aceDocument.GetEventMentions(sentenceIndex);
        // convert entity mentions
        foreach (AceEntityMention aceEntityMention in entityMentions)
        {
            // Recover the coreference chain id: the id of the AceEntity that owns this mention.
            string corefID = string.Empty;
            foreach (string entityID in aceDocument.GetKeySetEntities())
            {
                AceEntity e = aceDocument.GetEntity(entityID);
                if (e.GetMentions().Contains(aceEntityMention))
                {
                    corefID = entityID;
                    break;
                }
            }
            EntityMention convertedMention = ConvertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
            // EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
            entityCounts.IncrementCount(convertedMention.GetType());
            logger.Info("CONVERTED MENTION HEAD SPAN: " + convertedMention.GetHead());
            logger.Info("CONVERTED ENTITY MENTION: " + convertedMention);
            AnnotationUtils.AddEntityMention(sentence, convertedMention);
            // relation/event conversion below looks mentions up by ACE id
            entityMentionMap[aceEntityMention.GetId()] = convertedMention;
        }
        // TODO: make Entity objects as needed
        // convert relation mentions
        foreach (AceRelationMention aceRelationMention in relationMentions)
        {
            RelationMention convertedMention = ConvertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
            if (convertedMention != null)
            {
                relationCounts.IncrementCount(convertedMention.GetType());
                logger.Info("CONVERTED RELATION MENTION: " + convertedMention);
                AnnotationUtils.AddRelationMention(sentence, convertedMention);
            }
        }
        // TODO: make Relation objects
        // convert EventMentions
        foreach (AceEventMention aceEventMention in eventMentions)
        {
            EventMention convertedMention = ConvertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
            if (convertedMention != null)
            {
                eventCounts.IncrementCount(convertedMention.GetType());
                logger.Info("CONVERTED EVENT MENTION: " + convertedMention);
                AnnotationUtils.AddEventMention(sentence, convertedMention);
            }
        }
        // TODO: make Event objects
        results.Add(sentence);
        tokenOffset += tokens.Count;
    }
    return (results);
}
/// <summary>Finds the syntactic head of the given entity mention.</summary>
/// <param name="ent">The entity mention</param>
/// <param name="root">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <returns>
/// The tree object corresponding to the head. This MUST be a child of root.
/// It will be a leaf in the parse tree.
/// </returns>
public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList<CoreLabel> tokens)
{
    // Historical behavior is preserved behind a flag.
    if (!useNewHeadFinder)
    {
        return (OriginalFindSyntacticHead(ent, root, tokens));
    }
    logger.Fine("Searching for tree matching " + ent);
    Tree exactMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());
    //
    // found an exact match
    //
    if (exactMatch != null)
    {
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(exactMatch));
        return (SafeHead(exactMatch));
    }
    // no exact match found
    // in this case, we parse the actual extent of the mention, embedded in a sentence
    // context, so as to make the parser work better :-)
    // approximateness counts tokens we drop (separated dashes), so the head lookup
    // later can tolerate index drift of that size.
    int approximateness = 0;
    IList<CoreLabel> extentTokens = new List<CoreLabel>();
    extentTokens.Add(InitCoreLabel("It"));
    extentTokens.Add(InitCoreLabel("was"));
    int AddedWords = 2;
    for (int i = ent.GetExtentTokenStart(); i < ent.GetExtentTokenEnd(); i++)
    {
        // Add everything except separated dashes! The separated dashes mess with the parser too badly.
        CoreLabel label = tokens[i];
        if (!"-".Equals(label.Word()))
        {
            extentTokens.Add(tokens[i]);
        }
        else
        {
            approximateness++;
        }
    }
    extentTokens.Add(InitCoreLabel("."));
    // constrain the parse to the part we're interested in.
    // Starting from ADDED_WORDS comes from skipping "It was".
    // -1 to exclude the period.
    // We now let it be any kind of nominal constituent, since there
    // are VP and S ones
    ParserConstraint constraint = new ParserConstraint(AddedWords, extentTokens.Count - 1, ".*");
    IList<ParserConstraint> constraints = Java.Util.Collections.SingletonList(constraint);
    Tree tree = Parse(extentTokens, constraints);
    logger.Fine("No exact match found. Local parse:\n" + tree.PennString());
    ConvertToCoreLabels(tree);
    tree.IndexSpans(ent.GetExtentTokenStart() - AddedWords);
    // remember it has ADDED_WORDS extra words at the beginning
    Tree subtree = FindPartialSpan(tree, ent.GetExtentTokenStart());
    Tree extentHead = SafeHead(subtree);
    logger.Fine("Head is: " + extentHead);
    System.Diagnostics.Debug.Assert((extentHead != null));
    // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
    // Because we deleted dashes, it's index will be >= the index in the extent parse tree
    CoreLabel l = (CoreLabel)extentHead.Label();
    // Tree realHead = findTreeWithSpan(root, l.get(CoreAnnotations.BeginIndexAnnotation.class), l.get(CoreAnnotations.EndIndexAnnotation.class));
    Tree realHead = FunkyFindLeafWithApproximateSpan(root, l.Value(), l.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), approximateness);
    if (realHead != null)
    {
        logger.Fine("Chosen head: " + realHead);
    }
    // NOTE: may return null if no approximate leaf match is found — callers must handle that.
    return (realHead);
}
/// <summary>
/// Runs named-entity recognition over one section of a news article:
/// groups consecutive tokens sharing the same entity flag, emits each group as
/// display text, and records recognized entity mentions (and unseen entities).
/// </summary>
/// <param name="sectionIndex">Position of the section within the news article.</param>
/// <param name="content">Space-separated "word/flag" token pairs for the section.</param>
private void splitEntity(int sectionIndex, string content)
{
    // sentence accumulates the words of the current same-flag run.
    string sentence = "";
    // entityFlag is the flag of the current run; -1 = no run started yet.
    int entityFlag = -1;
    int wordIndex = -1;
    if (content.Equals("") || content.Equals(" "))
    {
        return;
    }
    addMargin(); // add the indentation at the start of the paragraph
    string[] tokens = content.Split(' ');
    foreach (string wordWithFlag in tokens)
    {
        // NOTE(review): assumes every token contains a '/' separator —
        // tempArray[1] throws IndexOutOfRangeException otherwise; confirm input format.
        string[] tempArray = wordWithFlag.Split('/');
        string word = tempArray[0];
        string flag = tempArray[1];
        // Flag index within the configured entity list; -1 when not an entity flag.
        int wordFlag = Array.IndexOf(Const.entityList, flag);
        if (sentence != "")
        {
            if (entityFlag == wordFlag)
            {
                // Same flag as the running span: extend it.
                sentence += word;
                wordIndex++;
            }
            else
            {
                // Flag changed: flush the finished run as display text.
                addText(sentence, entityFlag);
                if (entityFlag >= 0 && entityFlag != 4) // found an entity (flag 4 is excluded — confirm its meaning)
                {
                    var entityMention = new EntityMention();
                    entityMention.indexInSection = wordIndex;
                    entityMention.indexInNews = wordIndex + sectionIndex;
                    entityMention.newsId = this.newsId;
                    entityMention.value = sentence;
                    entityMentionList.Add(entityMention);
                    if (!entityMap.ContainsKey(sentence))
                    {
                        var entity = DBHelper.db.Queryable<NamedEntity>().Where(it => it.value == sentence).First();
                        if (entity == null) // only keep entities not yet persisted to the database
                        {
                            entity = new NamedEntity();
                            entity.value = sentence;
                            entityMap.Add(sentence, entity);
                        }
                    }
                }
                // Start a new run with the current word.
                entityFlag = wordFlag;
                sentence = word;
                wordIndex++;
            }
        }
        else
        {
            // First word of the section: start the initial run.
            entityFlag = wordFlag;
            sentence = word;
            wordIndex++;
        }
    }
    // Flush the trailing run, then terminate the paragraph.
    if (sentence != "")
    {
        addText(sentence, entityFlag);
        addLineBreak();
    }
}
/// <summary>
/// Persists an annotated document to disk: document-level sentiment (.snt),
/// sentence-level sentiment (.csv), and the user annotations in the format
/// selected by <c>doc.Type</c>: "default" (.ann), "xml" (.xml),
/// "stanford" (.conll) or "luis" (.lou).
/// </summary>
/// <param name="doc">The annotated document posted by the client.</param>
/// <returns>true if the annotation file was written; false on failure or an unknown type.</returns>
private bool TransformAnnotationDocument(Models.Document doc)
{
    string text = doc.RawText;
    string type = doc.Type;
    string user = "******";
    string todayString = DateTime.Today.ToString("MMddyyyy");
    string originalFilename = doc.FileName;
    //*****************************************************************************
    // Document-level sentiment -> .snt (best effort; failures are ignored).
    string sentiment = doc.DocumentSentiment;
    var newSentimentFilename = Path.ChangeExtension(originalFilename, ".snt");
    try
    {
        string filePath = ResolveOutputPath(todayString, user, newSentimentFilename);
        System.IO.FileInfo file = new System.IO.FileInfo(filePath);
        file.Directory.Create();
        using (StreamWriter sentFile = new StreamWriter(file.FullName, false))
        {
            sentFile.WriteLine(sentiment);
        }
    }
    catch (Exception)
    {
        // Sentiment output is optional; swallow and continue.
    }
    //*****************************************************************************
    // Sentence-level sentiment -> .csv with "Sentiment,Sentence" rows (best effort).
    List<string> senSentiment = doc.SentenceSentiment;
    List<string> docSentences = doc.Sentences;
    var newSentenceSentimentFilename = Path.ChangeExtension(originalFilename, ".csv");
    try
    {
        string filePath = ResolveOutputPath(todayString, user, newSentenceSentimentFilename);
        System.IO.FileInfo file = new System.IO.FileInfo(filePath);
        file.Directory.Create();
        using (StreamWriter sentFile = new StreamWriter(file.FullName, false))
        {
            var writer = new CsvWriter(sentFile);
            writer.Configuration.Delimiter = ",";
            // Write the header
            writer.WriteField("Sentiment");
            writer.WriteField("Sentence");
            writer.NextRecord();
            for (int sen = 0; sen < senSentiment.Count; sen++)
            {
                writer.WriteField(senSentiment[sen] ?? "Unknown");
                writer.WriteField(docSentences[sen]);
                writer.NextRecord();
            }
        }
    }
    catch (Exception)
    {
        // Sentence sentiment output is optional as well.
    }
    //*****************************************************************************
    // Parse the user-entered annotations and order them by begin offset.
    string annotations = string.IsNullOrEmpty(doc.Annotations) ? "" : doc.Annotations;
    List<Annotation> clientAnnotations = JsonConvert.DeserializeObject<List<Annotation>>(annotations);
    if (clientAnnotations != null)
    {
        clientAnnotations.Sort((ca1, ca2) => ca1.begin.CompareTo(ca2.begin));
    }
    // Here we write to file with the chosen annotation type
    if (type == "default")
    {
        // One serialized EntityMention per line.
        var newFilename = Path.ChangeExtension(originalFilename, ".ann");
        List<EntityMention> ems = new List<EntityMention>();
        if (clientAnnotations != null)
        {
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                EntityMention em = new EntityMention();
                em.begin = clientAnnotation.begin;
                em.end = clientAnnotation.end;
                em.type = clientAnnotation.type;
                em.text = text.Substring(clientAnnotation.begin, clientAnnotation.end - clientAnnotation.begin);
                ems.Add(em);
            }
        }
        try
        {
            string filePath = ResolveOutputPath(todayString, user, newFilename);
            List<string> lines = new List<string>();
            foreach (EntityMention em in ems)
            {
                lines.Add(em.ToString());
            }
            WriteAnnotationFile(filePath, type, lines);
            return true;
        }
        catch (Exception)
        {
            return false;
        }
    }
    else if (type == "xml")
    {
        // Inline markup: wrap each annotated span of the raw text in <TYPE>...</TYPE>.
        var newFilename = Path.ChangeExtension(originalFilename, ".xml");
        string fulltext = "";
        int currentLocation = 0;
        if (clientAnnotations != null)
        {
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                int begin = clientAnnotation.begin;
                int end = clientAnnotation.end;
                string entityType = clientAnnotation.type;
                fulltext += text.Substring(currentLocation, begin - currentLocation);
                fulltext += "<" + entityType + ">";
                fulltext += text.Substring(begin, end - begin);
                fulltext += "</" + entityType + ">";
                currentLocation = end;
            }
            fulltext += text.Substring(currentLocation);
        }
        try
        {
            string filePath = ResolveOutputPath(todayString, user, newFilename);
            WriteAnnotationFile(filePath, type, new List<string> { fulltext });
        }
        catch (Exception)
        {
            return false;
        }
        return true;
    }
    else if (type == "stanford")
    {
        // CoNLL-style "token NER" lines; sentences separated by a blank line.
        var newFilename = Path.ChangeExtension(originalFilename, ".conll");
        string fulltext = "";
        int clientAnnotationNumber = 0;
        int clientAnnotationSize = 0;
        Annotation clientAnnotation = null;
        int clientAnnotationBegin = Int32.MaxValue;
        int clientAnnotationEnd = Int32.MaxValue;
        string clientAnnotationType = "";
        if (clientAnnotations != null && clientAnnotations.Count > 0)
        {
            clientAnnotationSize = clientAnnotations.Count;
            clientAnnotation = clientAnnotations[0];
            clientAnnotationBegin = clientAnnotation.begin;
            clientAnnotationEnd = clientAnnotation.end;
            clientAnnotationType = clientAnnotation.type;
        }
        edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(text);
        PipelineDispenser.StanfordPipeline.annotate(document);
        List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));
        foreach (CoreMap sentence in sentences)
        {
            // BUG FIX: tokens were fetched from the whole document on every
            // iteration, duplicating all document tokens once per sentence;
            // fetch each sentence's own tokens instead.
            List<CoreLabel> tokens = JavaExtensions.ToList<CoreLabel>((java.util.List)sentence.get(typeof(TokensAnnotation)));
            foreach (CoreLabel token in tokens)
            {
                int tokenBegin = token.beginPosition();
                int tokenEnd = token.endPosition();
                string chosenNer = "O";
                if (isContainedIn(tokenBegin, tokenEnd, clientAnnotationBegin, clientAnnotationEnd))
                {
                    chosenNer = clientAnnotationType;
                    if (tokenEnd == clientAnnotationEnd)
                    {
                        // This annotation is fully emitted; advance to the next one.
                        clientAnnotationNumber++;
                        if (clientAnnotationNumber < clientAnnotationSize)
                        {
                            clientAnnotation = clientAnnotations[clientAnnotationNumber];
                            clientAnnotationBegin = clientAnnotation.begin;
                            clientAnnotationEnd = clientAnnotation.end;
                            clientAnnotationType = clientAnnotation.type;
                        }
                    }
                }
                fulltext += (token.value() + " " + chosenNer + Environment.NewLine);
            }
            fulltext += Environment.NewLine;
        }
        try
        {
            // BUG FIX (via ResolveOutputPath): this branch previously re-checked
            // the Debug environment where it should have checked Release, so no
            // output path was ever resolved when running in Release.
            string filePath = ResolveOutputPath(todayString, user, newFilename);
            WriteAnnotationFile(filePath, type, new List<string> { fulltext });
        }
        catch (Exception)
        {
            return false;
        }
        return true;
    }
    else if (type == "luis")
    {
        // LUIS JSON fragments: one {"entity","startPos","endPos"} object per line.
        // Note: LUIS end positions are inclusive, hence the -1.
        var newFilename = Path.ChangeExtension(originalFilename, ".lou");
        string fulltext = "";
        if (clientAnnotations != null)
        {
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                EntityMention em = new EntityMention();
                em.begin = clientAnnotation.begin;
                em.end = clientAnnotation.end - 1;
                em.type = clientAnnotation.type;
                fulltext += ("{" + "\"entity\": \"" + em.type + "\", \"startPos\": " + em.begin + ", \"endPos\": " + em.end + "}," + "\n");
            }
        }
        try
        {
            string filePath = ResolveOutputPath(todayString, user, newFilename);
            WriteAnnotationFile(filePath, type, new List<string> { fulltext });
            return true;
        }
        catch (Exception)
        {
            return false;
        }
    }
    else
    {
        return false;
    }
}

/// <summary>
/// Resolves the output path filesRoot/&lt;date&gt;/&lt;user&gt;/&lt;filename&gt;,
/// mapping through the web application root in the Debug environment and using
/// the configured root directly in Release.
/// </summary>
private string ResolveOutputPath(string todayString, string user, string filename)
{
    string relative = ConfigurationManager.AppSettings["filesRoot"] + todayString + "/" + user + "/" + filename;
    if (ConfigurationManager.AppSettings["environment"] == Debug)
    {
        return System.Web.HttpContext.Current.Server.MapPath(relative);
    }
    if (ConfigurationManager.AppSettings["environment"] == Release)
    {
        return relative;
    }
    // Unknown environment: surfaced as an exception by FileInfo(null) in callers.
    return null;
}

/// <summary>
/// Writes an annotation file: the standard two-line comment header followed by
/// the given content lines. Creates the target directory if needed.
/// </summary>
private void WriteAnnotationFile(string filePath, string type, IEnumerable<string> lines)
{
    System.IO.FileInfo file = new System.IO.FileInfo(filePath);
    file.Directory.Create();
    using (StreamWriter writer = new StreamWriter(file.FullName, false))
    {
        writer.WriteLine("###THIS IS A COMMENT BLOCK###");
        writer.WriteLine("###FORMAT: " + type + " ###");
        foreach (string line in lines)
        {
            writer.WriteLine(line);
        }
    }
}
/// <summary>
/// Opens a previously saved annotation file and returns its content as a JSON
/// string: element 0 is the sentiment label (or "UNK" when no .snt sidecar file
/// exists), element 1 is the list of annotations. Supports the "default"
/// (one EntityMention per line) and "xml" (inline-tagged text) formats declared
/// on line 2 of the file; returns null for unknown formats or on read errors.
/// </summary>
/// <param name="location">Path to the annotation file to open.</param>
/// <returns>JSON-serialized [sentiment, annotations] pair, or null on failure.</returns>
public string OpenFile(string location) {
    List<object> annotationParts = new List<object>();
    // First, read the sentiment sidecar file. It is ok for it to not exist;
    // fall back to the "UNK" placeholder in that case.
    try {
        var sentimentLocation = Path.ChangeExtension(location, ".snt");
        string[] sentimentLines = File.ReadAllLines(sentimentLocation);
        string sentiment = sentimentLines[0];
        annotationParts.Add(sentiment);
    } catch (Exception) {
        annotationParts.Add("UNK");
    }
    // Second, read the annotation file itself.
    //===========================================
    // Read configuration for entity types first (needed by the xml branch).
    List<EntityType> entityTypes = fetchEntityTypesFromConfiguration();
    try {
        string[] allLines = File.ReadAllLines(location);
        // Line 0 is the comment header, line 1 declares the format, e.g.
        // "###FORMAT: default ###"; the payload starts at line 2.
        string formatLine = allLines[1];
        var formatPattern = @"###FORMAT: ([A-Za-z]+) ###";
        var match = Regex.Match(formatLine, formatPattern);
        string format = match.Groups[1].Value;
        if (format == "default") {
            // One serialized EntityMention per payload line.
            List<Annotation> annotations = new List<Annotation>();
            for (int index = 2; index < allLines.Length; index++) {
                EntityMention em = EntityMention.FromString(allLines[index]);
                Annotation ann = new Annotation();
                ann.begin = em.begin;
                ann.end = em.end;
                ann.type = em.type;
                ann.color = colorDict[ann.type];
                annotations.Add(ann);
            }
            annotationParts.Add(annotations);
            return JsonConvert.SerializeObject(annotationParts);
        } else if (format == "xml") {
            List<Annotation> annotations = new List<Annotation>();
            // FIX: GetRange takes (index, count); the previous count of
            // allLines.Length overran the list and threw ArgumentException,
            // making this branch always return null.
            string fulltext = String.Concat(
                new List<string>(allLines).GetRange(2, allLines.Length - 2).ToArray());
            var inlinePattern = BuildEntityTypePattern(entityTypes);
            Regex inlineRegex = new Regex(inlinePattern);
            // NOTE(review): xmlJunkOffset is never incremented, so offsets are
            // not adjusted for the inline tag markup preceding each match —
            // TODO confirm intended tag-stripping behavior and accumulate it.
            int xmlJunkOffset = 0;
            foreach (Match inlineMatch in inlineRegex.Matches(fulltext)) {
                // FIX: use the current inline match, not the format-line
                // "match" captured above, which the old code referenced.
                string text = inlineMatch.Value;
                int begin = inlineMatch.Index;
                int end = begin + inlineMatch.Length;
                Annotation ann = new Annotation();
                ann.begin = begin - xmlJunkOffset;
                ann.end = end - xmlJunkOffset;
                SetAnnotationType(ann, entityTypes, text);
                ann.color = colorDict[ann.type];
                // FIX: the annotation was previously built but never added,
                // so the serialized list was always empty.
                annotations.Add(ann);
            }
            annotationParts.Add(annotations);
            return JsonConvert.SerializeObject(annotationParts);
        } else {
            return null;
        }
    } catch (Exception) {
        return null;
    }
}
/// <summary>
/// Reads one sentence in Roth column (CoNLL-style) format from
/// <paramref name="lineIterator"/> and builds an <see cref="Annotation"/>
/// carrying its tokens, text, entity mentions and relation mentions.
/// Lines are dispatched by field count: 1 field = blank separator,
/// 3 fields = relation (arg1-index, arg2-index, type), 9 fields = token line.
/// Reading stops after two blank lines (one blank line separates the token
/// block from the relation block; the second ends the sentence) or when the
/// iterator is exhausted.
/// </summary>
/// <param name="docId">Document id stored on the resulting sentence annotation.</param>
/// <param name="lineIterator">Iterator over raw input lines; advanced in place.</param>
/// <returns>The populated sentence annotation.</returns>
private Annotation ReadSentence(string docId, IEnumerator<string> lineIterator)
{
    Annotation sentence = new Annotation(string.Empty);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), new List<EntityMention>());
    // we'll need to set things like the tokens and textContent after we've
    // fully read the sentence

    // contains the full text that we've read so far
    StringBuilder textContent = new StringBuilder();
    int tokenCount = 0;                                    // how many tokens we've seen so far
    IList<CoreLabel> tokens = new List<CoreLabel>();
    // when we've seen two blank lines in a row, this sentence is over (one
    // blank line separates the sentence and the relations)
    int numBlankLinesSeen = 0;
    string sentenceID = null;
    // keeps track of entities we've seen so far, keyed by their column index,
    // for use when resolving relation arguments
    IDictionary<string, EntityMention> indexToEntityMention = new Dictionary<string, EntityMention>();
    while (lineIterator.MoveNext() && numBlankLinesSeen < 2)
    {
        string currentLine = lineIterator.Current;
        // the format escapes literal commas as "COMMA"; undo that here
        currentLine = currentLine.Replace("COMMA", ",");
        IList<string> pieces = StringUtils.Split(currentLine);
        string identifier;
        int size = pieces.Count;
        switch (size)
        {
            case 1:
            {
                // blank line between sentences or relations
                numBlankLinesSeen++;
                break;
            }

            case 3:
            {
                // relation: pieces[0]/pieces[1] are the token indices of the two
                // argument entities, pieces[2] is the relation type.
                // NOTE(review): the indexer throws KeyNotFoundException if a
                // relation references an entity index not seen earlier in this
                // sentence — presumably well-formed input guarantees order; verify.
                string type = pieces[2];
                IList<ExtractionObject> args = new List<ExtractionObject>();
                EntityMention entity1 = indexToEntityMention[pieces[0]];
                EntityMention entity2 = indexToEntityMention[pieces[1]];
                args.Add(entity1);
                args.Add(entity2);
                // span covers everything from the start of arg1 to the end of arg2
                Span span = new Span(entity1.GetExtentTokenStart(), entity2.GetExtentTokenEnd());
                // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
                identifier = RelationMention.MakeUniqueId();
                RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                AnnotationUtils.AddRelationMention(sentence, relationMention);
                break;
            }

            case 9:
            {
                // token
                /*
                 * Roth token lines look like this:
                 *
                 * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                 */
                // Entities may be multiple words joined by '/'; we split these up
                IList<string> words = StringUtils.Split(pieces[5], "/");
                //List<String> postags = StringUtils.split(pieces.get(4),"/");
                string text = StringUtils.Join(words, " ");
                identifier = "entity" + pieces[0] + '-' + pieces[2];
                // entity type of the word/expression (normalized; "O" = no entity)
                string nerTag = GetNormalizedNERTag(pieces[1]);
                if (sentenceID == null)
                {
                    // first token line of the sentence carries the sentence id
                    sentenceID = pieces[0];
                }
                if (!nerTag.Equals("O"))
                {
                    Span extentSpan = new Span(tokenCount, tokenCount + words.Count);
                    // Temporarily sets the head span to equal the extent span.
                    // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
                    // The head span is later modified if preprocessSentences is called.
                    EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                    AnnotationUtils.AddEntityMention(sentence, entity);
                    // we can get by using these indices as strings since we only use them
                    // as a hash key
                    string index = pieces[2];
                    indexToEntityMention[index] = entity;
                }
                // int i =0;
                foreach (string word in words)
                {
                    CoreLabel label = new CoreLabel();
                    label.SetWord(word);
                    //label.setTag(postags.get(i));
                    label.Set(typeof(CoreAnnotations.TextAnnotation), word);
                    label.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                    // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
                    // not keeping track of character offsets
                    tokens.Add(label);
                }
                // i++;
                textContent.Append(text);
                textContent.Append(' ');
                tokenCount += words.Count;
                break;
            }
        }
    }
    // finalize the sentence-level annotations now that all lines are consumed
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), textContent.ToString());
    sentence.Set(typeof(CoreAnnotations.ValueAnnotation), textContent.ToString());
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    sentence.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceID);
    return(sentence);
}