/// <summary>
/// Runs NER over a sentence and copies the resulting NER and normalized-NER
/// tags back onto the caller's tokens.
/// </summary>
/// <remarks>
/// The classifier is run on a defensive copy of the tokens because
/// QuantifiableEntityNormalizer may also mutate the POS tags of the labels
/// it processes, and we do not want that to leak back to the caller.
/// </remarks>
/// <param name="tokens">the sentence tokens to annotate in place</param>
private void AnnotateTokens<Token>(IList<Token> tokens)
    where Token : CoreLabel
{
    // BUGFIX: the parameter type was previously written as IList<TOKEN>,
    // which does not resolve — C# identifiers are case-sensitive and the
    // generic parameter is declared as "Token".
    IList<CoreLabel> words = new List<CoreLabel>();
    foreach (CoreLabel token in tokens)
    {
        CoreLabel word = new CoreLabel();
        word.SetWord(token.Word());
        word.SetNER(token.Ner());
        word.SetTag(token.Tag());
        // copy fields potentially set by SUTime
        NumberSequenceClassifier.TransferAnnotations(token, word);
        words.Add(word);
    }
    DoOneSentence(words);
    // TODO: If collapsed is set, tokens for entities are collapsed into one node;
    // then words.Count != tokens.Count and the copy-back loop below does not work!
    for (int i = 0; i < words.Count; i++)
    {
        string ner = words[i].Ner();
        tokens[i].SetNER(ner);
        tokens[i].Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation),
            words[i].Get(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation)));
    }
}
/// <summary>
/// Builds a CoreLabel for a token taken from XML text, recording the raw text
/// and the character offsets, and optionally normalizing whitespace.
/// </summary>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
{
    CoreLabel label = new CoreLabel();
    label.SetOriginalText(tokenText);
    if (separatorPattern.Matcher(tokenText).Matches())
    {
        // Map to CoreNLP newline token
        tokenText = AbstractTokenizer.NewlineToken;
    }
    else if (doNormalization && normalizeSpace)
    {
        // change space to non-breaking space
        tokenText = tokenText.Replace(' ', '\u00A0');
    }
    label.SetWord(tokenText);
    label.SetValue(tokenText);
    label.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
    label.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);
    if (Verbose)
    {
        log.Info("Adding token " + label.ToShorterString());
    }
    return label;
}
/// <summary>
/// Lemmatizes the given words, tagging them first if necessary.
/// Only works on English, as it is hard coded to use the Morphology class,
/// which is English-only.
/// </summary>
public virtual IList<CoreLabel> Lemmatize<_T0>(IList<_T0> tokens)
    where _T0 : IHasWord
{
    // Obtain POS tags either from the dedicated tagger or from a parse.
    IList<TaggedWord> tagged;
    if (GetOp().testOptions.preTag)
    {
        IFunction<IList<IHasWord>, IList<TaggedWord>> tagger = LoadTagger();
        tagged = tagger.Apply(tokens);
    }
    else
    {
        Tree tree = Parse(tokens);
        tagged = tree.TaggedYield();
    }
    // Stem each tagged word into a fresh CoreLabel.
    Morphology morphology = new Morphology();
    IList<CoreLabel> result = Generics.NewArrayList();
    foreach (TaggedWord taggedWord in tagged)
    {
        CoreLabel lemmaLabel = new CoreLabel();
        lemmaLabel.SetWord(taggedWord.Word());
        lemmaLabel.SetTag(taggedWord.Tag());
        morphology.Stem(lemmaLabel);
        result.Add(lemmaLabel);
    }
    return result;
}
public UnnamedDependency(string regent, string dependent)
{
    // We store the text of the labels separately because an object can ask
    // for a hash code while only partially reconstructed during
    // unserialization: a TreeGraphNode might request the hash code of an
    // UnnamedDependency, which would then read an unfilled member of that
    // same TreeGraphNode. Keeping the plain text breaks that cycle.
    if (regent == null || dependent == null)
    {
        throw new ArgumentException("governor or dependent cannot be null");
    }
    CoreLabel governorLabel = new CoreLabel();
    governorLabel.SetValue(regent);
    governorLabel.SetWord(regent);
    this.regent = governorLabel;
    CoreLabel dependentLabel = new CoreLabel();
    dependentLabel.SetValue(dependent);
    dependentLabel.SetWord(dependent);
    this.dependent = dependentLabel;
    regentText = regent;
    dependentText = dependent;
}
/// <summary>
/// Loops back through all the datums inserted for the most recent word
/// and inserts statistics about the word they are a part of.
/// </summary>
/// <remarks>
/// This needs to be post hoc because the CoreLabel lists coming from testing
/// data sets are pre-segmented (so treating each of those CoreLabels as a
/// "word" lets us cheat and get 100% classification accuracy by just looking
/// at whether we're at the beginning of a "word").
/// </remarks>
/// <param name="iobList">the full list of per-character datums</param>
/// <param name="currentWord">the word the trailing datums belong to</param>
/// <param name="wordStartIndex">index of the first datum of that word</param>
private static void FillInWordStatistics(IList<CoreLabel> iobList, string currentWord, int wordStartIndex)
{
    for (int idx = wordStartIndex; idx < iobList.Count; ++idx)
    {
        CoreLabel datum = iobList[idx];
        // Index is relative to the start of the containing word.
        datum.SetIndex(idx - wordStartIndex);
        datum.SetWord(currentWord);
    }
}
/// <summary>Builds a CoreLabel with the given surface form and NER tag.</summary>
private CoreLabel MkLabel(string word, string ner)
{
    CoreLabel cl = new CoreLabel();
    cl.SetWord(word);
    cl.SetOriginalText(word);
    cl.SetNER(ner);
    return cl;
}
/// <summary>Creates a CoreLabel whose word, value, and text annotations all carry the given token.</summary>
private static CoreLabel InitCoreLabel(string token)
{
    CoreLabel result = new CoreLabel();
    result.SetWord(token);
    result.SetValue(token);
    result.Set(typeof(CoreAnnotations.TextAnnotation), token);
    result.Set(typeof(CoreAnnotations.ValueAnnotation), token);
    return result;
}
/// <summary>
/// Copies the CoreLabel cl, replacing its word/value/original text with the
/// given part and resetting its character span.
/// </summary>
private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition, int endPosition)
{
    CoreLabel copy = new CoreLabel(cl);
    copy.SetWord(part);
    copy.SetValue(part);
    copy.SetBeginPosition(beginPosition);
    copy.SetEndPosition(endPosition);
    copy.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
    return copy;
}
/// <summary>
/// Creates a CoreLabel for the given gloss, setting its index when the
/// index is non-negative.
/// </summary>
protected internal virtual CoreLabel MkWord(string gloss, int index)
{
    CoreLabel label = new CoreLabel();
    label.SetWord(gloss);
    label.SetValue(gloss);
    if (index >= 0)
    {
        label.SetIndex(index);
    }
    return label;
}
/// <summary>Create a dummy word, just with a given word at a given index.</summary>
/// <remarks>
/// Mostly useful for making semantic graphs. A negative index leaves the
/// label unindexed.
/// </remarks>
public static CoreLabel MkWord(string gloss, int index)
{
    CoreLabel word = new CoreLabel();
    word.SetWord(gloss);
    word.SetValue(gloss);
    if (index >= 0)
    {
        word.SetIndex(index);
    }
    return word;
}
// Arbitrary test input. We just need to segment something on multiple
// threads to reproduce the issue.
private static IList<CoreLabel> CreateTestTokens()
{
    CoreLabel label = new CoreLabel();
    label.SetWord("你好,世界");
    label.SetValue("你好,世界");
    label.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
    label.Set(typeof(CoreAnnotations.AnswerAnnotation), "0");
    IList<CoreLabel> tokens = new List<CoreLabel>();
    tokens.Add(label);
    return tokens;
}
/// <summary>
/// Builds the initial parser State for a tagged sentence: one preterminal
/// (tag over word) tree node per input word.
/// </summary>
/// <param name="words">tagged words; each must either be a CoreLabel with a tag or implement IHasTag</param>
/// <returns>a State whose queue holds one preterminal per word</returns>
/// <exception cref="ArgumentException">if a word is untagged</exception>
public static State InitialStateFromTaggedSentence<_T0>(IList<_T0> words)
    where _T0 : IHasWord
{
    IList<Tree> preterminals = Generics.NewArrayList();
    for (int index = 0; index < words.Count; ++index)
    {
        IHasWord hw = words[index];
        CoreLabel wordLabel;
        string tag;
        if (hw is CoreLabel)
        {
            // Already a CoreLabel: reuse it directly (note: mutated below).
            wordLabel = (CoreLabel)hw;
            tag = wordLabel.Tag();
        }
        else
        {
            // Wrap a plain tagged word in a fresh CoreLabel.
            wordLabel = new CoreLabel();
            wordLabel.SetValue(hw.Word());
            wordLabel.SetWord(hw.Word());
            if (!(hw is IHasTag))
            {
                throw new ArgumentException("Expected tagged words");
            }
            tag = ((IHasTag)hw).Tag();
            wordLabel.SetTag(tag);
        }
        if (tag == null)
        {
            throw new ArgumentException("Input word not tagged");
        }
        CoreLabel tagLabel = new CoreLabel();
        tagLabel.SetValue(tag);
        // Index from 1. Tools downstream from the parser expect that.
        // Internally this parser uses the index, so we have to
        // overwrite incorrect indices if the label is already indexed.
        wordLabel.SetIndex(index + 1);
        tagLabel.SetIndex(index + 1);
        LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
        LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
        tagNode.AddChild(wordNode);
        // TODO: can we get away with not setting these on the wordLabel?
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        preterminals.Add(tagNode);
    }
    return(new State(preterminals));
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Splits on hyphens by first padding them with spaces, so the hyphens
/// themselves become separate tokens. The split parts are queued in
/// compoundBuffer; the first part is returned immediately and the rest are
/// presumably drained by the caller on subsequent calls — TODO confirm
/// against the tokenizer that owns compoundBuffer.
/// </remarks>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    // NOTE(review): ReplaceAll/Split here look like Sharpen's Java-regex
    // extension methods, not System.String members — "\\s+" is a regex.
    string[] parts = cl.Word().ReplaceAll("-", " - ").Split("\\s+");
    foreach (string part in parts)
    {
        // Each part keeps all of cl's annotations except word/value/original text.
        CoreLabel newLabel = new CoreLabel(cl);
        newLabel.SetWord(part);
        newLabel.SetValue(part);
        newLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
        compoundBuffer.Add(newLabel);
    }
    // Remove(0) follows Java List.remove(int) semantics: removes and returns
    // the head of the buffer (a Sharpen extension).
    return(compoundBuffer.Remove(0));
}
/// <summary>
/// Verifies that CoreLabel.SetWord clears the lemma whenever the word
/// actually changes, and that equality and hash codes stay consistent with
/// that behavior (including treating a null lemma the same as a missing one).
/// </summary>
public virtual void TestCoreLabelSetWordBehavior()
{
    CoreLabel foo = new CoreLabel();
    foo.Set(typeof(CoreAnnotations.TextAnnotation), "foo");
    foo.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "B");
    foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
    // Lemma gets removed with word
    ArrayCoreMap copy = new ArrayCoreMap(foo);
    NUnit.Framework.Assert.AreEqual(copy, foo);
    foo.SetWord("foo");
    NUnit.Framework.Assert.AreEqual(copy, foo);
    // same word set
    foo.SetWord("bar");
    NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
    // lemma removed
    foo.SetWord("foo");
    NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
    // still removed
    foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
    NUnit.Framework.Assert.AreEqual(copy, foo);
    // back to normal
    // Hash code is consistent
    int hashCode = foo.GetHashCode();
    NUnit.Framework.Assert.AreEqual(copy.GetHashCode(), hashCode);
    foo.SetWord("bar");
    NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
    foo.SetWord("foo");
    NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
    // Hash code doesn't care between a value of null and the key not existing
    NUnit.Framework.Assert.IsTrue(foo.Lemma() == null);
    int lemmalessHashCode = foo.GetHashCode();
    foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
    NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
    foo.SetLemma(null);
    NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
    foo.SetLemma("fool");
    NUnit.Framework.Assert.AreEqual(hashCode, foo.GetHashCode());
    // Check equals
    foo.SetWord("bar");
    foo.SetWord("foo");
    ArrayCoreMap nulledCopy = new ArrayCoreMap(foo);
    NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
    foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
    NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
}
/// <summary>
/// Writes the regex-NER pattern file once (guarded by a class-wide lock) and
/// builds two parallel corpora from the words/tags/ner fixture arrays: plain
/// tagged sentences and sentences that additionally carry gold NER tags.
/// </summary>
public virtual void SetUp()
{
    // NOTE(review): locking on a Type object is a known C# anti-pattern
    // (any code can take the same lock); kept as-is since changing it needs
    // a shared field outside this method.
    lock (typeof(RegexNERSequenceClassifierTest))
    {
        if (tempFile == null)
        {
            tempFile = File.CreateTempFile("regexnertest.patterns", "txt");
            FileWriter fout = new FileWriter(tempFile);
            BufferedWriter bout = new BufferedWriter(fout);
            bout.Write("sausage\tfood\n");
            bout.Write("(avocet|curlew)(s?)\tshorebird\n");
            bout.Write("shoreline park\tpark\n");
            // NOTE(review): bout is flushed but never closed; fout is closed
            // directly — verify this doesn't drop buffered data on some
            // writer implementations.
            bout.Flush();
            fout.Close();
        }
    }
    sentences = new List<IList<CoreLabel>>();
    NERsentences = new List<IList<CoreLabel>>();
    // Fixture arrays must line up token-for-token.
    NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
    NUnit.Framework.Assert.AreEqual(words.Length, ner.Length);
    for (int snum = 0; snum < words.Length; ++snum)
    {
        string[] wordPieces = words[snum].Split(" ");
        string[] tagPieces = tags[snum].Split(" ");
        string[] nerPieces = ner[snum].Split(" ");
        NUnit.Framework.Assert.AreEqual(wordPieces.Length, tagPieces.Length);
        NUnit.Framework.Assert.AreEqual(wordPieces.Length, nerPieces.Length, "Input " + snum + " " + words[snum] + " of different length than " + ner[snum]);
        IList<CoreLabel> sentence = new List<CoreLabel>();
        IList<CoreLabel> NERsentence = new List<CoreLabel>();
        for (int wnum = 0; wnum < wordPieces.Length; ++wnum)
        {
            // Plain token: word + POS only.
            CoreLabel token = new CoreLabel();
            token.SetWord(wordPieces[wnum]);
            token.SetTag(tagPieces[wnum]);
            sentence.Add(token);
            // Gold token: word + POS + NER.
            CoreLabel NERtoken = new CoreLabel();
            NERtoken.SetWord(wordPieces[wnum]);
            NERtoken.SetTag(tagPieces[wnum]);
            NERtoken.SetNER(nerPieces[wnum]);
            NERsentence.Add(NERtoken);
        }
        sentences.Add(sentence);
        NERsentences.Add(NERsentence);
    }
}
/// <summary>
/// Convert a String to a list of characters suitable for labeling in an IOB
/// segmentation model.
/// </summary>
/// <param name="tokenList">pre-tokenized input</param>
/// <param name="segMarker">the segmentation marker character</param>
/// <param name="applyRewriteRules">add rewrite labels (for training data)</param>
/// <param name="stripRewrites">
/// revert training data to old Green and DeNero model (remove
/// rewrite labels but still rewrite to try to preserve raw text)
/// </param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
public static IList<CoreLabel> StringToIOB(IList<CoreLabel> tokenList, char segMarker, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    // Capacity is a heuristic over-estimate (roughly 8 datums per token).
    IList<CoreLabel> iobList = new List<CoreLabel>(tokenList.Count * 7 + tokenList.Count);
    string strSegMarker = segMarker.ToString();
    bool addWhitespace = false;
    // NOTE(review): numTokens is unused in this method.
    int numTokens = tokenList.Count;
    string lastToken = string.Empty;
    string currentWord = string.Empty;
    int wordStartIndex = 0;
    foreach (CoreLabel cl in tokenList)
    {
        if (addWhitespace)
        {
            // Close out the previous word, then insert a boundary datum.
            FillInWordStatistics(iobList, currentWord, wordStartIndex);
            currentWord = string.Empty;
            // +1 skips past the boundary datum appended just below.
            wordStartIndex = iobList.Count + 1;
            iobList.Add(CreateDatum(cl, BoundaryChar, BoundarySymbol));
            CoreLabel boundaryDatum = iobList[iobList.Count - 1];
            boundaryDatum.SetIndex(0);
            boundaryDatum.SetWord(string.Empty);
            addWhitespace = false;
        }
        // What type of token is this
        string token = cl.Word();
        IOBUtils.TokenType tokType = GetTokenType(token, strSegMarker);
        token = StripSegmentationMarkers(token, tokType);
        System.Diagnostics.Debug.Assert(token.Length != 0);
        if (ShouldNotSegment(token))
        {
            // Token is emitted whole as a single non-segmentable datum.
            iobList.Add(CreateDatum(cl, token, NosegSymbol));
            addWhitespace = true;
        }
        else
        {
            // Iterate over the characters in the token
            // NOTE(review): cl is passed both as the 2nd and 5th argument —
            // verify against TokenToDatums' signature that this is intended.
            TokenToDatums(iobList, cl, token, tokType, cl, lastToken, applyRewriteRules, stripRewrites, tf, origText);
            // Whitespace follows unless this token is a prefix/infix segment.
            addWhitespace = (tokType == IOBUtils.TokenType.BeginMarker || tokType == IOBUtils.TokenType.NoMarker);
        }
        currentWord += token;
        lastToken = token;
    }
    // Flush statistics for the final word (no trailing boundary datum).
    FillInWordStatistics(iobList, currentWord, wordStartIndex);
    return(iobList);
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Pads dashes with spaces and splits on whitespace, producing one label per
/// part (including the dashes themselves) with character offsets computed
/// relative to the original label's begin position. Parts are queued in
/// compoundBuffer; the first is returned now and the rest are presumably
/// drained on later calls — TODO confirm against the owning tokenizer.
/// </remarks>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    // pDash/pSpace look like precompiled (Sharpen-style) regex patterns.
    string[] parts = pSpace.Split(pDash.Matcher(cl.Word()).ReplaceAll(" - "));
    int lengthAccum = 0;
    foreach (string part in parts)
    {
        CoreLabel newLabel = new CoreLabel(cl);
        newLabel.SetWord(part);
        newLabel.SetValue(part);
        // Offsets track cumulative part lengths; note the inserted padding
        // spaces are not counted, so spans abut each other.
        newLabel.SetBeginPosition(cl.BeginPosition() + lengthAccum);
        newLabel.SetEndPosition(cl.BeginPosition() + lengthAccum + part.Length);
        newLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
        compoundBuffer.Add(newLabel);
        lengthAccum += part.Length;
    }
    // Remove(0): Java List.remove(int) semantics — pops and returns the head.
    return(compoundBuffer.Remove(0));
}
/// <summary>
/// Segments a raw line into tokens by running the IOB segmenter and then
/// merging each labeled span back into a single CoreLabel with character
/// offsets into the original line.
/// </summary>
public virtual IList<CoreLabel> SegmentStringToTokenList(string line)
{
    IList<CoreLabel> result = CollectionUtils.MakeList();
    IList<CoreLabel> labeledSequence = SegmentStringToIOB(line);
    foreach (IntPair span in IOBUtils.TokenSpansForIOB(labeledSequence))
    {
        string text = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker, span.GetSource(), span.GetTarget());
        // Character span of this token in the original line.
        int start = labeledSequence[span.GetSource()].BeginPosition();
        int end = labeledSequence[span.GetTarget() - 1].EndPosition();
        CoreLabel seg = new CoreLabel();
        seg.SetWord(text);
        seg.SetValue(text);
        seg.Set(typeof(CoreAnnotations.TextAnnotation), text);
        seg.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");
        seg.SetOriginalText(Sharpen.Runtime.Substring(line, start, end));
        seg.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), start);
        seg.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
        result.Add(seg);
    }
    return result;
}
public UnnamedDependency(string regent, string dependent)
{
    // Both endpoints are required.
    if (regent == null || dependent == null)
    {
        throw new ArgumentException("governor or dependent cannot be null");
    }
    // Wrap each endpoint string in a CoreLabel carrying it as word and value.
    var governor = new CoreLabel();
    governor.SetValue(regent);
    governor.SetWord(regent);
    this._regent = governor;
    var dep = new CoreLabel();
    dep.SetValue(dependent);
    dep.SetWord(dependent);
    this._dependent = dep;
    // Keep the raw text alongside the labels.
    RegentText = regent;
    DependentText = dependent;
}
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    CoreLabel label = new CoreLabel();
    if (!this.parse.IsLeaf)
    {
        // Interior node: carry the constituent type; depth-1 nodes double as tags.
        label.SetCategory(this.parse.Type);
        label.SetValue(this.parse.Type);
        if (this.Depth() == 1)
        {
            label.SetTag(this.parse.Type);
        }
    }
    else
    {
        // Leaf: carry the surface word plus its character span.
        label.SetWord(this.parse.Value);
        label.SetBeginPosition(this.parse.Span.Start);
        label.SetEndPosition(this.parse.Span.End);
        label.SetValue(this.parse.Value);
    }
    return label;
}
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    var result = new CoreLabel();
    if (this.parse.IsLeaf)
    {
        // Leaf node: surface word, value, and character offsets.
        result.SetWord(this.parse.Value);
        result.SetBeginPosition(this.parse.Span.Start);
        result.SetEndPosition(this.parse.Span.End);
        result.SetValue(this.parse.Value);
        return result;
    }
    // Interior node: constituent category; depth-1 nodes also act as tags.
    result.SetCategory(this.parse.Type);
    result.SetValue(this.parse.Type);
    if (this.Depth() == 1)
    {
        result.SetTag(this.parse.Type);
    }
    return result;
}
/// <summary>
/// Normalizes a whole parse tree: prunes empty nodes, splices out A-over-A
/// configurations, moves morphological analyses off leaf values into
/// OriginalTextAnnotation, repairs missing preterminals, applies PRD/NP-SBJ
/// relabeling, and wraps the result under the configured root label.
/// May return null when the tree consists entirely of enclosing brackets.
/// </summary>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
    foreach (Tree t in tree)
    {
        if (t.IsLeaf())
        {
            //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
            //specified by HasContext.
            if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
            {
                string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                if (toks.Length != 2)
                {
                    // NOTE(review): %s placeholders are Java-style; assumes a
                    // Sharpen string.Format shim — confirm.
                    log.Err(string.Format("%s: Word contains malformed morph annotation: %s", this.GetType().FullName, t.Value()));
                }
                else
                {
                    if (t.Label() is CoreLabel)
                    {
                        CoreLabel cl = (CoreLabel)t.Label();
                        // Leaf value/word become the bare surface form.
                        cl.SetValue(string.Intern(toks[0].Trim()));
                        cl.SetWord(string.Intern(toks[0].Trim()));
                        Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                        string lemma = lemmaMorph.First();
                        string morphAnalysis = lemmaMorph.Second();
                        if (lemma.Equals(toks[0]))
                        {
                            // Lemma equals surface form: store analysis as-is.
                            cl.SetOriginalText(string.Intern(toks[1].Trim()));
                        }
                        else
                        {
                            // TODO(spenceg): Does this help?
                            string newLemma = lexMapper.Map(null, lemma);
                            if (newLemma == null || newLemma.Trim().IsEmpty())
                            {
                                newLemma = lemma;
                            }
                            string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                            cl.SetOriginalText(string.Intern(newMorphAnalysis));
                        }
                    }
                    else
                    {
                        log.Error(string.Format("%s: Cannot store morph analysis in non-CoreLabel: %s", this.GetType().FullName, t.Label().GetType().FullName));
                    }
                }
            }
        }
        else
        {
            if (t.IsPreTerminal())
            {
                if (t.Value() == null || t.Value().IsEmpty())
                {
                    log.Warn(string.Format("%s: missing tag for %s", this.GetType().FullName, t.PennString()));
                }
                else
                {
                    // Mirror the preterminal's value into the label's tag field.
                    if (t.Label() is IHasTag)
                    {
                        ((IHasTag)t.Label()).SetTag(t.Value());
                    }
                }
            }
            else
            {
                //Phrasal nodes
                // there are some nodes "/" missing preterminals. We'll splice in a tag for these.
                int nk = t.NumChildren();
                IList<Tree> newKids = new List<Tree>(nk);
                for (int j = 0; j < nk; j++)
                {
                    Tree child = t.GetChild(j);
                    if (child.IsLeaf())
                    {
                        log.Warn(string.Format("%s: Splicing in DUMMYTAG for %s", this.GetType().FullName, t.ToString()));
                        newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                    }
                    else
                    {
                        newKids.Add(child);
                    }
                }
                t.SetChildren(newKids);
            }
        }
    }
    //Every node in the tree has now been processed
    //
    // Additional processing for specific phrasal annotations
    //
    // special global coding for moving PRD annotation from constituent to verb tag.
    if (markPRDverb)
    {
        TregexMatcher m = prdVerbPattern.Matcher(tree);
        Tree match = null;
        while (m.Find())
        {
            // Guard against relabeling the same matched node twice in a row.
            if (m.GetMatch() != match)
            {
                match = m.GetMatch();
                match.Label().SetValue(match.Label().Value() + "-PRDverb");
                Tree prd = m.GetNode("prd");
                prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
            }
        }
    }
    //Mark *only* subjects in verb-initial clauses
    if (retainNPSbj)
    {
        TregexMatcher m = npSbjPattern.Matcher(tree);
        while (m.Find())
        {
            Tree match = m.GetMatch();
            match.Label().SetValue("NP");
        }
    }
    if (tree.IsPreTerminal())
    {
        // The whole tree is a bare tag: bad!
        string val = tree.Label().Value();
        if (val.Equals("CC") || val.StartsWith("PUNC") || val.Equals("CONJ"))
        {
            log.Warn(string.Format("%s: Bare tagged word being wrapped in FRAG %s", this.GetType().FullName, tree.PennString()));
            tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
        }
        else
        {
            log.Warn(string.Format("%s: Bare tagged word %s", this.GetType().FullName, tree.PennString()));
        }
    }
    //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
    {
        tree = tree.FirstChild();
    }
    if (tree != null && !tree.Value().Equals(rootLabel))
    {
        tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
    }
    return(tree);
}
/// <summary>
/// Rewrites the leaves of a tree: optionally replaces surface forms with
/// their (PTB-escaped) lemmas and/or appends lemma+morphology markup to each
/// leaf value. Increments the static nTokens / nMorphAnalyses counters as a
/// side effect.
/// </summary>
/// <param name="tree">tree whose yield must consist of CoreLabels</param>
/// <param name="lemmasAsLeaves">replace leaf word/value with the lemma</param>
/// <param name="addMorphoToLeaves">append morph markup to the leaf value</param>
/// <exception cref="ArgumentException">if any leaf label is not a CoreLabel</exception>
public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
{
    IList<ILabel> labels = tree.Yield();
    foreach (ILabel label in labels)
    {
        ++nTokens;
        if (!(label is CoreLabel))
        {
            throw new ArgumentException("Only works with CoreLabels trees");
        }
        CoreLabel coreLabel = (CoreLabel)label;
        string lemma = coreLabel.Lemma();
        //PTB escaping since we're going to put this in the leaf
        if (lemma == null)
        {
            // No lemma, so just add the surface form
            lemma = coreLabel.Word();
        }
        else
        {
            if (lemma.Equals("("))
            {
                lemma = "-LRB-";
            }
            else
            {
                if (lemma.Equals(")"))
                {
                    lemma = "-RRB-";
                }
            }
        }
        if (lemmasAsLeaves)
        {
            string escapedLemma = lemma;
            coreLabel.SetWord(escapedLemma);
            coreLabel.SetValue(escapedLemma);
            coreLabel.SetLemma(lemma);
        }
        if (addMorphoToLeaves)
        {
            string morphStr = coreLabel.OriginalText();
            if (morphStr == null || morphStr.Equals(string.Empty))
            {
                morphStr = MorphoFeatureSpecification.NoAnalysis;
            }
            else
            {
                ++nMorphAnalyses;
            }
            // Normalize punctuation analyses
            if (morphStr.StartsWith("PONCT"))
            {
                morphStr = "PUNC";
            }
            // NOTE(review): %s placeholders are Java-style format specifiers;
            // assumes a Sharpen string.Format shim — confirm.
            string newLeaf = string.Format("%s%s%s%s%s", coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
            coreLabel.SetValue(newLeaf);
            coreLabel.SetWord(newLeaf);
        }
    }
}
/// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
/// <param name="docid">optional document id, "???" when absent</param>
/// <param name="sentenceIndex">optional sentence index, -1 when absent</param>
/// <param name="gloss">sentence text; literal "\n"/"\t" sequences are unescaped</param>
/// <param name="tree">builds the primary dependency graph from the tokens</param>
/// <param name="maltTree">builds the alternative (Malt) dependency graph</param>
/// <param name="words">token surface forms</param>
/// <param name="lemmas">per-token lemmas (must match words in length)</param>
/// <param name="pos">per-token POS tags (must match words in length)</param>
/// <param name="ner">per-token NER tags (must match words in length)</param>
/// <param name="sentenceid">optional sentence id used in error messages</param>
/// <exception cref="ArgumentException">if any annotation list length differs from words</exception>
private static Annotation ParseSentence(Optional<string> docid, Optional<int> sentenceIndex, string gloss, Func<IList<CoreLabel>, SemanticGraph> tree, Func<IList<CoreLabel>, SemanticGraph> maltTree, IList<string> words, IList<string> lemmas, IList<string> pos, IList<string> ner, Optional<string> sentenceid)
{
    // Error checks
    if (lemmas.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (pos.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (ner.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    // Create structure
    IList<CoreLabel> tokens = new List<CoreLabel>(words.Count);
    int beginChar = 0;
    for (int i = 0; i < words.Count; ++i)
    {
        CoreLabel token = new CoreLabel(12);
        token.SetWord(words[i]);
        token.SetValue(words[i]);
        // Character offsets assume tokens are separated by a single space.
        token.SetBeginPosition(beginChar);
        token.SetEndPosition(beginChar + words[i].Length);
        beginChar += words[i].Length + 1;
        token.SetLemma(lemmas[i]);
        token.SetTag(pos[i]);
        token.SetNER(ner[i]);
        token.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
        token.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
        // Token indices are 1-based; token begin/end are 0-based.
        token.Set(typeof(CoreAnnotations.IndexAnnotation), i + 1);
        token.Set(typeof(CoreAnnotations.TokenBeginAnnotation), i);
        token.Set(typeof(CoreAnnotations.TokenEndAnnotation), i + 1);
        tokens.Add(token);
    }
    // Unescape literal backslash-n / backslash-t sequences in the gloss.
    gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");
    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    // The same graph serves as basic, collapsed, and CC-processed dependencies.
    SemanticGraph graph = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), graph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), graph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);
    SemanticGraph maltGraph = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), maltGraph);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);
    // Wrap the single sentence in a document-level Annotation.
    Annotation doc = new Annotation(gloss);
    doc.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    doc.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    doc.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    doc.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    return(doc);
}
/// <summary>
/// Parses a file of sentences whose entity labels are marked inline with
/// XML-like tags (e.g. &lt;food&gt; ... &lt;/food&gt;), producing one CoreMap per
/// sentence with labeled tokens.
/// </summary>
/// <remarks>
/// Each input line is optionally "id\ttext"; lines without a tab use the line
/// number as id. Tag tokens themselves are consumed to switch the current
/// label and are not emitted as tokens.
/// </remarks>
/// <param name="reader">source of input lines</param>
/// <param name="categoriesAllowed">label names recognized as tags</param>
/// <param name="setClassForTheseLabels">optional per-label annotation class to also set</param>
/// <param name="setGoldClass">whether to record the label as the gold answer</param>
/// <param name="sentIDprefix">prefix prepended to every sentence id</param>
/// <exception cref="System.IO.IOException"/>
public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    // Patterns matching opening/closing tags for any allowed category.
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    string backgroundSymbol = "O";
    IList<ICoreMap> sentences = new List<ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        // Split "id<TAB>text"; a lone field is text with the line number as id.
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            id = t[0];
            text = t[1];
        }
        else
        {
            if (t.Length == 1)
            {
                text = t[0];
                id = lineNum.ToString();
            }
        }
        id = sentIDprefix + id;
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);
        // Current label persists across tokens until a closing tag is seen.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList<IHasWord> sentence in dp)
        {
            sentNum++;
            string sentStr = string.Empty;
            IList<CoreLabel> sent = new List<CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                Matcher endMatcher = endLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    //System.out.println("matched starting");
                    label = startingMatcher.Group(1);
                }
                else
                {
                    if (endMatcher.Matches())
                    {
                        //System.out.println("matched end");
                        label = backgroundSymbol;
                    }
                    else
                    {
                        CoreLabel c = new CoreLabel();
                        // NOTE(review): toks always holds exactly one token,
                        // so this inner loop runs once per token.
                        IList<string> toks = new List<string>();
                        toks.Add(tok);
                        foreach (string toksplit in toks)
                        {
                            sentStr += " " + toksplit;
                            c.SetWord(toksplit);
                            c.SetLemma(toksplit);
                            c.SetValue(toksplit);
                            c.Set(typeof(CoreAnnotations.TextAnnotation), toksplit);
                            c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                            if (setGoldClass)
                            {
                                c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                            }
                            if (setClassForTheseLabels != null && setClassForTheseLabels.Contains(label))
                            {
                                c.Set(setClassForTheseLabels[label], label);
                            }
                            sent.Add(c);
                        }
                    }
                }
            }
            // One CoreMap per tokenized sentence, id suffixed with sentence number.
            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return(sentences);
}
/// <summary>
/// Reads in a single ACE*.apf.xml file and convert it to RelationSentence
/// objects.
/// </summary>
/// <remarks>
/// Reads in a single ACE*.apf.xml file and convert it to RelationSentence
/// objects. However, you probably should call parse() instead.
/// Produces one CoreMap per sentence, with converted entity, relation, and
/// event mentions attached; SGML-only "sentences" are skipped.
/// </remarks>
/// <param name="prefix">
/// prefix of ACE filename to read (e.g.
/// "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"
/// ) (no ".apf.xml" extension)
/// </param>
/// <param name="corpus">the target corpus annotation (unused here — TODO confirm callers rely on this parameter)</param>
/// <returns>list of RelationSentence objects</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
private IList<ICoreMap> ReadDocument(string prefix, Annotation corpus)
{
    logger.Info("Reading document: " + prefix);
    IList<ICoreMap> results = new List<ICoreMap>();
    AceDocument aceDocument;
    if (aceVersion.Equals("ACE2004"))
    {
        aceDocument = AceDocument.ParseDocument(prefix, false, aceVersion);
    }
    else
    {
        aceDocument = AceDocument.ParseDocument(prefix, false);
    }
    string docId = aceDocument.GetId();
    // map entity mention ID strings to their EntityMention counterparts
    IDictionary<string, EntityMention> entityMentionMap = Generics.NewHashMap();
    /*
    * for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
    * List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
    * StringBuffer b = new StringBuffer();
    * for(AceToken t: tokens) b.append(t.getLiteral() + " " );
    * logger.info("SENTENCE: " + b.toString());
    * }
    */
    // Running offset of the first token of the current sentence within the document.
    int tokenOffset = 0;
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.GetSentenceCount(); sentenceIndex++)
    {
        IList<AceToken> tokens = aceDocument.GetSentence(sentenceIndex);
        IList<CoreLabel> words = new List<CoreLabel>();
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < tokens.Count; i++)
        {
            // Build a CoreLabel per ACE token, keeping its byte offsets.
            CoreLabel l = new CoreLabel();
            l.SetWord(tokens[i].GetLiteral());
            l.Set(typeof(CoreAnnotations.ValueAnnotation), l.Word());
            l.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), tokens[i].GetByteStart());
            l.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), tokens[i].GetByteEnd());
            words.Add(l);
            if (i > 0)
            {
                textContent.Append(" ");
            }
            textContent.Append(tokens[i].GetLiteral());
        }
        // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
        if (words.Count == 1)
        {
            string word = words[0].Word();
            if (word.StartsWith("<") && word.EndsWith(">"))
            {
                tokenOffset += tokens.Count;
                continue;
            }
        }
        ICoreMap sentence = new Annotation(textContent.ToString());
        sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
        logger.Info("Reading sentence: \"" + textContent + "\"");
        IList<AceEntityMention> entityMentions = aceDocument.GetEntityMentions(sentenceIndex);
        IList<AceRelationMention> relationMentions = aceDocument.GetRelationMentions(sentenceIndex);
        IList<AceEventMention> eventMentions = aceDocument.GetEventMentions(sentenceIndex);
        // convert entity mentions
        foreach (AceEntityMention aceEntityMention in entityMentions)
        {
            // Find the entity that owns this mention to use as the coref id.
            string corefID = string.Empty;
            foreach (string entityID in aceDocument.GetKeySetEntities())
            {
                AceEntity e = aceDocument.GetEntity(entityID);
                if (e.GetMentions().Contains(aceEntityMention))
                {
                    corefID = entityID;
                    break;
                }
            }
            EntityMention convertedMention = ConvertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
            // EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
            entityCounts.IncrementCount(convertedMention.GetType());
            logger.Info("CONVERTED MENTION HEAD SPAN: " + convertedMention.GetHead());
            logger.Info("CONVERTED ENTITY MENTION: " + convertedMention);
            AnnotationUtils.AddEntityMention(sentence, convertedMention);
            entityMentionMap[aceEntityMention.GetId()] = convertedMention;
        }
        // TODO: make Entity objects as needed
        // convert relation mentions
        foreach (AceRelationMention aceRelationMention in relationMentions)
        {
            RelationMention convertedMention = ConvertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
            if (convertedMention != null)
            {
                relationCounts.IncrementCount(convertedMention.GetType());
                logger.Info("CONVERTED RELATION MENTION: " + convertedMention);
                AnnotationUtils.AddRelationMention(sentence, convertedMention);
            }
        }
        // TODO: make Relation objects
        // convert EventMentions
        foreach (AceEventMention aceEventMention in eventMentions)
        {
            EventMention convertedMention = ConvertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
            if (convertedMention != null)
            {
                eventCounts.IncrementCount(convertedMention.GetType());
                logger.Info("CONVERTED EVENT MENTION: " + convertedMention);
                AnnotationUtils.AddEventMention(sentence, convertedMention);
            }
        }
        // TODO: make Event objects
        results.Add(sentence);
        tokenOffset += tokens.Count;
    }
    return(results);
}
/// <summary>
/// Reads one sentence from a Roth/CoNLL-04-style file via the given line iterator
/// and converts it into an Annotation carrying tokens, text, entity mentions and
/// relation mentions.
/// </summary>
/// <remarks>
/// Line format is dispatched on the number of whitespace-separated fields:
/// 1 field = blank separator line, 3 fields = a relation (arg1 index, arg2 index,
/// type), 9 fields = a token line. Reading stops after two blank lines have been
/// seen (one blank line separates the token block from the relation block; a
/// second ends the sentence). NOTE(review): the blank-line counter is never reset
/// on non-blank lines, so it effectively counts total rather than consecutive
/// blanks — matches the upstream Java reader, so left as-is.
/// </remarks>
/// <param name="docId">document identifier stored on the sentence annotation</param>
/// <param name="lineIterator">iterator over the raw input lines; advanced in place</param>
/// <returns>the assembled sentence Annotation (never null)</returns>
private Annotation ReadSentence(string docId, IEnumerator <string> lineIterator)
{
    Annotation sentence = new Annotation(string.Empty);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), new List <EntityMention>());
    // we'll need to set things like the tokens and textContent after we've
    // fully read the sentence
    // contains the full text that we've read so far
    StringBuilder textContent = new StringBuilder();
    int tokenCount = 0;
    // how many tokens we've seen so far
    IList <CoreLabel> tokens = new List <CoreLabel>();
    // when we've seen two blank lines in a row, this sentence is over (one
    // blank line separates the sentence and the relations
    int numBlankLinesSeen = 0;
    string sentenceID = null;
    // keeps tracks of entities we've seen so far for use by relations
    IDictionary <string, EntityMention> indexToEntityMention = new Dictionary <string, EntityMention>();
    while (lineIterator.MoveNext() && numBlankLinesSeen < 2)
    {
        string currentLine = lineIterator.Current;
        // the corpus escapes literal commas as "COMMA"; undo that before splitting
        currentLine = currentLine.Replace("COMMA", ",");
        IList <string> pieces = StringUtils.Split(currentLine);
        string identifier;
        int size = pieces.Count;
        switch (size)
        {
            case 1:
            {
                // blank line between sentences or relations
                numBlankLinesSeen++;
                break;
            }

            case 3:
            {
                // relation line: <arg1 entity index> <arg2 entity index> <relation type>
                string type = pieces[2];
                IList <ExtractionObject> args = new List <ExtractionObject>();
                EntityMention entity1 = indexToEntityMention[pieces[0]];
                EntityMention entity2 = indexToEntityMention[pieces[1]];
                args.Add(entity1);
                args.Add(entity2);
                // relation span runs from the start of arg1's extent to the end of arg2's
                Span span = new Span(entity1.GetExtentTokenStart(), entity2.GetExtentTokenEnd());
                // identifier = "relation" + sentenceID + "-" + sentence.getAllRelations().size();
                identifier = RelationMention.MakeUniqueId();
                RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                AnnotationUtils.AddRelationMention(sentence, relationMention);
                break;
            }

            case 9:
            {
                // token line. Roth token lines look like this:
                //
                //   19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                //
                // Entities may be multiple words joined by '/'; we split these up
                IList <string> words = StringUtils.Split(pieces[5], "/");
                //List<String> postags = StringUtils.split(pieces.get(4),"/");
                string text = StringUtils.Join(words, " ");
                identifier = "entity" + pieces[0] + '-' + pieces[2];
                string nerTag = GetNormalizedNERTag(pieces[1]);
                // entity type of the word/expression
                // the first field of the first token line doubles as the sentence id
                if (sentenceID == null)
                {
                    sentenceID = pieces[0];
                }
                if (!nerTag.Equals("O"))
                {
                    Span extentSpan = new Span(tokenCount, tokenCount + words.Count);
                    // Temporarily sets the head span to equal the extent span.
                    // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
                    // The head span is later modified if preprocessSentences is called.
                    EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                    AnnotationUtils.AddEntityMention(sentence, entity);
                    // we can get by using these indices as strings since we only use them
                    // as a hash key
                    string index = pieces[2];
                    indexToEntityMention[index] = entity;
                }
                // int i =0;
                foreach (string word in words)
                {
                    CoreLabel label = new CoreLabel();
                    label.SetWord(word);
                    //label.setTag(postags.get(i));
                    label.Set(typeof(CoreAnnotations.TextAnnotation), word);
                    label.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                    // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
                    // not keeping track of character offsets
                    tokens.Add(label);
                }
                // i++;
                textContent.Append(text);
                textContent.Append(' ');
                tokenCount += words.Count;
                break;
            }
        }
    }
    // now that all lines are consumed, attach the accumulated text and tokens
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), textContent.ToString());
    sentence.Set(typeof(CoreAnnotations.ValueAnnotation), textContent.ToString());
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    sentence.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceID);
    return (sentence);
}
/// <summary>Parse a sentence represented as a List of tokens.</summary>
/// <remarks>
/// Parse a sentence represented as a List of tokens.
/// The text must already have been tokenized and
/// normalized into tokens that are appropriate to the treebank
/// which was used to train the parser. The tokens can be of
/// multiple types, and the list items need not be homogeneous as to type
/// (in particular, only some words might be given tags):
/// <ul>
/// <li>If a token implements HasWord, then the word to be parsed is
/// given by its word() value.</li>
/// <li>If a token implements HasTag and the tag() value is not
/// null or the empty String, then the parser is strongly advised to assign
/// a part of speech tag that <i>begins</i> with this String.</li>
/// </ul>
/// Internally this resets all parse-state flags, copies the sentence (applying
/// op.wordFunction if set), appends the lexicon boundary symbol, then runs the
/// enabled parsers in order: PCFG, dependency, and (when both are on) the
/// factored bparser. A false return from any stage returns the current value of
/// parseSucceeded; Thread interruption between stages raises
/// RuntimeInterruptedException.
/// </remarks>
/// <param name="sentence">The sentence to parse</param>
/// <returns>true Iff the sentence was accepted by the grammar</returns>
/// <exception cref="System.NotSupportedException">
/// If the Sentence is too long or
/// of zero length or the parse
/// otherwise fails for resource reasons
/// </exception>
private bool ParseInternal <_T0>(IList <_T0> sentence) where _T0 : IHasWord
{
    // reset all per-parse state before doing anything else
    parseSucceeded = false;
    parseNoMemory = false;
    parseUnparsable = false;
    parseSkipped = false;
    parseFallback = false;
    whatFailed = null;
    addedPunct = false;
    originalSentence = sentence;
    int length = sentence.Count;
    if (length == 0)
    {
        parseSkipped = true;
        throw new NotSupportedException("Can't parse a zero-length sentence!");
    }
    // sentenceB is the working copy that gets the word function applied and
    // the boundary token appended; the caller's list is never mutated
    IList <IHasWord> sentenceB;
    if (op.wordFunction != null)
    {
        sentenceB = Generics.NewArrayList();
        foreach (IHasWord word in originalSentence)
        {
            if (word is ILabel)
            {
                // clone via the label factory so applying wordFunction below
                // doesn't modify the caller's labels
                ILabel label = (ILabel)word;
                ILabel newLabel = label.LabelFactory().NewLabel(label);
                if (newLabel is IHasWord)
                {
                    sentenceB.Add((IHasWord)newLabel);
                }
                else
                {
                    throw new AssertionError("This should have been a HasWord");
                }
            }
            else
            {
                if (word is IHasTag)
                {
                    // preserve the supplied tag when copying
                    TaggedWord tw = new TaggedWord(word.Word(), ((IHasTag)word).Tag());
                    sentenceB.Add(tw);
                }
                else
                {
                    sentenceB.Add(new Word(word.Word()));
                }
            }
        }
        foreach (IHasWord word_1 in sentenceB)
        {
            word_1.SetWord(op.wordFunction.Apply(word_1.Word()));
        }
    }
    else
    {
        // no word function: a shallow copy is enough
        sentenceB = new List <IHasWord>(sentence);
    }
    if (op.testOptions.addMissingFinalPunctuation)
    {
        addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
    }
    // length is the pre-punctuation count, checked against the configured cap
    if (length > op.testOptions.maxLength)
    {
        parseSkipped = true;
        throw new NotSupportedException("Sentence too long: length " + length);
    }
    TreePrint treePrint = GetTreePrint();
    PrintWriter pwOut = op.tlpParams.Pw();
    //Insert the boundary symbol
    if (sentence[0] is CoreLabel)
    {
        CoreLabel boundary = new CoreLabel();
        boundary.SetWord(LexiconConstants.Boundary);
        boundary.SetValue(LexiconConstants.Boundary);
        boundary.SetTag(LexiconConstants.BoundaryTag);
        boundary.SetIndex(sentence.Count + 1);
        //1-based indexing used in the parser
        sentenceB.Add(boundary);
    }
    else
    {
        sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
    }
    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }
    // stage 1: PCFG parse (when enabled)
    if (op.doPCFG)
    {
        if (!pparser.Parse(sentenceB))
        {
            // parseSucceeded is still false here
            return (parseSucceeded);
        }
        if (op.testOptions.verbose)
        {
            pwOut.Println("PParser output");
            // getBestPCFGParse(false).pennPrint(pwOut); // with scores on nodes
            treePrint.PrintTree(GetBestPCFGParse(false), pwOut);
        }
    }
    // without scores on nodes
    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }
    // stage 2: dependency parse (skipped when fast factored parsing is on)
    if (op.doDep && !op.testOptions.useFastFactored)
    {
        if (!dparser.Parse(sentenceB))
        {
            return (parseSucceeded);
        }
        // cdm nov 2006: should move these printing bits to the main printing section,
        // so don't calculate the best parse twice!
        if (op.testOptions.verbose)
        {
            pwOut.Println("DParser output");
            treePrint.PrintTree(dparser.GetBestParse(), pwOut);
        }
    }
    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }
    // stage 3: factored parse combining PCFG and dependency scores
    if (op.doPCFG && op.doDep)
    {
        if (!bparser.Parse(sentenceB))
        {
            return (parseSucceeded);
        }
        else
        {
            parseSucceeded = true;
        }
    }
    return (true);
}