/// <summary>
/// Builds a <c>CoreLabel</c> for a single token, recording its text and character offsets.
/// </summary>
/// <remarks>
/// The label's original text is always the unmodified <paramref name="tokenText"/>. Its word
/// and value are <c>AbstractTokenizer.NewlineToken</c> when the text matches
/// <c>separatorPattern</c>; otherwise, when both <paramref name="doNormalization"/> and
/// <c>normalizeSpace</c> are set, every space is replaced by a non-breaking space (U+00A0).
/// </remarks>
/// <param name="tokenText">Raw token text.</param>
/// <param name="doNormalization">Whether space normalization may be applied to this token.</param>
/// <param name="charOffsetBegin">Value stored as the begin character offset.</param>
/// <param name="charOffsetEnd">Value stored as the end character offset.</param>
/// <returns>The newly created label.</returns>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
{
    CoreLabel result = new CoreLabel();
    // Record the raw text before it is possibly remapped below.
    result.SetOriginalText(tokenText);

    string surface;
    if (separatorPattern.Matcher(tokenText).Matches())
    {
        // Map to CoreNLP newline token
        surface = AbstractTokenizer.NewlineToken;
    }
    else if (doNormalization && normalizeSpace)
    {
        // change space to non-breaking space
        surface = tokenText.Replace(' ', '\u00A0');
    }
    else
    {
        surface = tokenText;
    }

    result.SetWord(surface);
    result.SetValue(surface);
    result.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
    result.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

    if (Verbose)
    {
        log.Info("Adding token " + result.ToShorterString());
    }

    return result;
}
/// <summary>
/// Creates a <c>CoreLabel</c> whose word and original text are both <paramref name="word"/>
/// and whose NER value is <paramref name="ner"/>.
/// </summary>
/// <param name="word">Text used for both the word and the original text.</param>
/// <param name="ner">NER value to set on the label.</param>
/// <returns>The newly created label.</returns>
private CoreLabel MkLabel(string word, string ner)
{
    CoreLabel result = new CoreLabel();

    result.SetWord(word);
    result.SetOriginalText(word);
    result.SetNER(ner);

    return result;
}
/// <summary>
/// Segments <paramref name="line"/> and returns one <c>CoreLabel</c> per token span.
/// </summary>
/// <remarks>
/// The line is first labeled via <c>SegmentStringToIOB</c>; each span reported by
/// <c>IOBUtils.TokenSpansForIOB</c> becomes one token. A token's word, value and text
/// annotation are the string produced by <c>IOBUtils.IOBToString</c> for that span, its
/// <c>ArabicSegAnnotation</c> is set to "1", and its character offsets run from the begin
/// position of the span's first label to the end position of its last label. The original
/// text is the substring of <paramref name="line"/> taken with those two offsets.
/// </remarks>
/// <param name="line">The text to segment.</param>
/// <returns>The list of tokens, in the order the spans are enumerated.</returns>
public virtual IList<CoreLabel> SegmentStringToTokenList(string line)
{
    IList<CoreLabel> result = CollectionUtils.MakeList();
    IList<CoreLabel> iobLabels = SegmentStringToIOB(line);

    foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobLabels))
    {
        int first = span.GetSource();
        // The span target is exclusive: the last label of the span is at index (target - 1).
        int limit = span.GetTarget();

        string text = IOBUtils.IOBToString(iobLabels, prefixMarker, suffixMarker, first, limit);

        CoreLabel token = new CoreLabel();
        token.SetWord(text);
        token.SetValue(text);
        token.Set(typeof(CoreAnnotations.TextAnnotation), text);
        token.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

        int start = iobLabels[first].BeginPosition();
        int end = iobLabels[limit - 1].EndPosition();
        string original = Sharpen.Runtime.Substring(line, start, end);

        token.SetOriginalText(original);
        token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), start);
        token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);

        result.Add(token);
    }

    return result;
}
/// <summary>
/// Splits a verb carrying attached pronoun suffixes (as marked by the lexer) into a stem
/// label plus one label per pronoun. Examples:
/// Escribamosela => Escribamo + se + la => escribamos + se + la
/// Sentaos => senta + os => sentad + os
/// Damelo => da + me + lo
/// </summary>
/// <remarks>
/// The parent annotation is always removed from <paramref name="cl"/>. When the verb stripper
/// returns no result, <paramref name="cl"/> itself is returned. Otherwise each pronoun label is
/// appended to <c>compoundBuffer</c> and the stem label is returned.
/// </remarks>
/// <param name="cl">The verb label to process; its parent annotation is removed in place.</param>
/// <returns>The stem label, or <paramref name="cl"/> when nothing could be stripped.</returns>
private CoreLabel ProcessVerb(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

    SpanishVerbStripper.StrippedVerb parts = verbStripper.SeparatePronouns(cl.Word());
    if (parts == null)
    {
        return cl;
    }

    // The stem occupies the first characters of the original token; the pronouns follow it
    // back to back, so each pronoun begins where the previous one ended.
    string originalStem = parts.GetOriginalStem();
    int stemEnd = cl.BeginPosition() + originalStem.Length;

    int pronounBegin = stemEnd;
    foreach (string pronoun in parts.GetPronouns())
    {
        compoundBuffer.Add(CopyCoreLabel(cl, pronoun, pronounBegin));
        pronounBegin += pronoun.Length;
    }

    CoreLabel stem = CopyCoreLabel(cl, parts.GetStem(), cl.BeginPosition(), stemEnd);
    stem.SetOriginalText(originalStem);
    return stem;
}
/// <summary>
/// Normalizes a whole tree. In order, this: prunes and splices nodes using <c>emptyFilter</c>
/// and <c>aOverAFilter</c>; moves morphological analyses found in leaf values into the leaf
/// label's original text; copies each preterminal's value onto its label as a tag; wraps
/// leaves that hang directly under a phrasal node in a "DUMMYTAG" preterminal; applies the
/// optional PRD-verb and NP-subject relabelings; wraps certain bare tagged words in "FRAG";
/// and finally makes sure the result is rooted at <c>rootLabel</c>.
/// </summary>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Factory used for pruning, splicing and creating new nodes.</param>
/// <returns>
/// The normalized tree, or <c>null</c> when stripping the empty-valued enclosing nodes leaves
/// nothing (e.g. the input consisted entirely of enclosing brackets).
/// </returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    // Prefix used in every diagnostic message below.
    string className = this.GetType().FullName;

    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);

    foreach (Tree t in tree)
    {
        if (t.IsLeaf())
        {
            // Strip off morphological analyses and place them in the OriginalTextAnnotation,
            // which is specified by HasContext.
            if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
            {
                string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                if (toks.Length != 2)
                {
                    // string.Format needs composite placeholders ({0}, {1}); printf-style
                    // "%s" is not substituted in .NET and would be printed literally.
                    log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", className, t.Value()));
                }
                else if (t.Label() is CoreLabel)
                {
                    CoreLabel cl = (CoreLabel)t.Label();
                    string word = string.Intern(toks[0].Trim());
                    cl.SetValue(word);
                    cl.SetWord(word);

                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                    string lemma = lemmaMorph.First();
                    string morphAnalysis = lemmaMorph.Second();

                    if (lemma.Equals(toks[0]))
                    {
                        // The lemma is just the word itself: store the raw analysis.
                        cl.SetOriginalText(string.Intern(toks[1].Trim()));
                    }
                    else
                    {
                        // TODO(spenceg): Does this help?
                        string newLemma = lexMapper.Map(null, lemma);
                        if (newLemma == null || newLemma.Trim().IsEmpty())
                        {
                            // Fall back to the unmapped lemma when mapping yields nothing usable.
                            newLemma = lemma;
                        }
                        string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                        cl.SetOriginalText(string.Intern(newMorphAnalysis));
                    }
                }
                else
                {
                    log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", className, t.Label().GetType().FullName));
                }
            }
        }
        else if (t.IsPreTerminal())
        {
            if (t.Value() == null || t.Value().IsEmpty())
            {
                log.Warn(string.Format("{0}: missing tag for {1}", className, t.PennString()));
            }
            else if (t.Label() is IHasTag)
            {
                ((IHasTag)t.Label()).SetTag(t.Value());
            }
        }
        else
        {
            // Phrasal nodes
            // there are some nodes "/" missing preterminals. We'll splice in a tag for these.
            int nk = t.NumChildren();
            IList<Tree> newKids = new List<Tree>(nk);
            for (int j = 0; j < nk; j++)
            {
                Tree child = t.GetChild(j);
                if (child.IsLeaf())
                {
                    log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", className, t.ToString()));
                    newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                }
                else
                {
                    newKids.Add(child);
                }
            }
            t.SetChildren(newKids);
        }
    }

    // Every node in the tree has now been processed
    //
    // Additional processing for specific phrasal annotations
    //

    // special global coding for moving PRD annotation from constituent to verb tag.
    if (markPRDverb)
    {
        TregexMatcher m = prdVerbPattern.Matcher(tree);
        Tree match = null;
        while (m.Find())
        {
            // Skip consecutive hits on the same node so the suffix is appended only once.
            if (m.GetMatch() != match)
            {
                match = m.GetMatch();
                match.Label().SetValue(match.Label().Value() + "-PRDverb");
                Tree prd = m.GetNode("prd");
                prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
            }
        }
    }

    // Mark *only* subjects in verb-initial clauses
    if (retainNPSbj)
    {
        TregexMatcher m = npSbjPattern.Matcher(tree);
        while (m.Find())
        {
            Tree match = m.GetMatch();
            match.Label().SetValue("NP");
        }
    }

    if (tree.IsPreTerminal())
    {
        // The whole tree is a bare tag: bad!
        string val = tree.Label().Value();
        if (val.Equals("CC") || val.StartsWith("PUNC") || val.Equals("CONJ"))
        {
            log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", className, tree.PennString()));
            tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
        }
        else
        {
            log.Warn(string.Format("{0}: Bare tagged word {1}", className, tree.PennString()));
        }
    }

    // Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    // If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    // will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
    {
        tree = tree.FirstChild();
    }

    if (tree != null && !tree.Value().Equals(rootLabel))
    {
        tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
    }

    return tree;
}