//is EnglishPU
/// <summary>
/// Builds the single-position character features for the token at
/// <paramref name="loc"/>.
/// </summary>
/// <remarks>
/// Features are only emitted when <c>flags.useWord1</c> is set; otherwise the
/// returned collection is empty. Each feature is the concatenation of one or
/// two neighbouring <c>CharAnnotation</c> values and a "::"-prefixed tag
/// naming the positions involved.
/// </remarks>
/// <param name="cInfo">The sequence being featurized; indexed from loc - 3 to loc + 3.</param>
/// <param name="loc">Index of the current position in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were generated.</returns>
public virtual ICollection <string> FeaturesC(PaddedList <IN> cInfo, int loc)
{
    List <string> result = new List <string>();

    // The whole +/-3 window is looked up (in the historical order) even though
    // the +3 and -3 positions do not contribute to any feature below.
    CoreLabel cur = cInfo[loc];
    CoreLabel next1 = cInfo[loc + 1];
    CoreLabel next2 = cInfo[loc + 2];
    CoreLabel next3 = cInfo[loc + 3];
    CoreLabel prev1 = cInfo[loc - 1];
    CoreLabel prev2 = cInfo[loc - 2];
    CoreLabel prev3 = cInfo[loc - 3];

    string ch = cur.Get(typeof(CoreAnnotations.CharAnnotation));
    string chNext1 = next1.Get(typeof(CoreAnnotations.CharAnnotation));
    string chNext2 = next2.Get(typeof(CoreAnnotations.CharAnnotation));
    string chNext3 = next3.Get(typeof(CoreAnnotations.CharAnnotation));
    string chPrev1 = prev1.Get(typeof(CoreAnnotations.CharAnnotation));
    string chPrev2 = prev2.Get(typeof(CoreAnnotations.CharAnnotation));
    string chPrev3 = prev3.Get(typeof(CoreAnnotations.CharAnnotation));

    if (!flags.useWord1)
    {
        return(result);
    }

    // Earlier feature set, kept for reference:
    // features.add(charc +"c");
    // features.add(charc1+"c1");
    // features.add(charp +"p");
    // features.add(charp +charc +"pc");
    // if(flags.useAs || flags.useMsr || flags.usePk || flags.useHk){ //msr, as
    //   features.add(charc +charc1 +"cc1");
    //   features.add(charp + charc1 +"pc1");
    // }
    result.AddRange(new string[]
    {
        ch + "::c",
        chNext1 + "::c1",
        chPrev1 + "::p",
        chPrev2 + "::p2",
        // trying to restore the features that Huishin described in SIGHAN 2005 paper
        ch + chNext1 + "::cn",
        chPrev1 + ch + "::pc",
        chPrev1 + chNext1 + "::pn",
        chPrev2 + chPrev1 + "::p2p",
        chPrev2 + ch + "::p2c",
        chNext2 + ch + "::n2c",
        "|word1"
    });
    return(result);
}
//Note: this doesn't necessarily find all possible candidates, but is kind of a greedy version.
// E.g. "Elizabeth and Jane" will return only "Elizabeth and Jane", but not "Elizabeth", and "Jane" as well.
/// <summary>
/// Walks the name trie rooted at <c>rootNameNode</c> over the tokens of
/// <paramref name="textRun"/> and collects every full name it completes.
/// </summary>
/// <param name="textRun">Inclusive (first, second) token index range to scan.</param>
/// <returns>
/// A pair of parallel lists: the matched full names, and for each one the
/// inclusive (begin, end) token indices it covers.
/// </returns>
public virtual Pair <List <string>, List <Pair <int, int> > > ScanForNamesNew(Pair <int, int> textRun)
{
    List <string> potentialNames = new List <string>();
    List <Pair <int, int> > nameIndices = new List <Pair <int, int> >();
    IList <CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    Sieve.TokenNode pointer = rootNameNode;

    // The index is declared outside the loop so that, once the loop ends, it
    // holds the first position that was NOT scanned. The run may stop early
    // at tokens.Count, in which case textRun.second + 1 would be too large.
    int index = textRun.first;
    for (; index <= textRun.second && index < tokens.Count; index++)
    {
        string tokenText = tokens[index].Word();
        if (pointer.childNodes.Keys.Contains(tokenText))
        {
            // Token continues the current trie path.
            pointer = pointer.childNodes[tokenText];
        }
        else if (!pointer.token.Equals("$ROOT"))
        {
            // Path broken: emit the name ending at the previous token (if the
            // node we stopped on is a complete name) and restart from the root.
            if (pointer.fullName != null)
            {
                potentialNames.Add(pointer.fullName);
                nameIndices.Add(new Pair <int, int>(index - 1 - pointer.level, index - 1));
            }
            pointer = rootNameNode;
        }
    }

    //catch the end case: a name that is still open when the run finishes
    if (!pointer.token.Equals("$ROOT") && pointer.fullName != null)
    {
        potentialNames.Add(pointer.fullName);
        nameIndices.Add(new Pair <int, int>(index - 1 - pointer.level, index - 1));
    }
    return(new Pair <List <string>, List <Pair <int, int> > >(potentialNames, nameIndices));
}
/// <summary>
/// Overwrites the value and tag of a preterminal's label with the alternate
/// tag computed by <paramref name="morpho"/> from the word's morphological
/// analysis string.
/// </summary>
/// <param name="t">A preterminal whose label and child label are both CoreLabels.</param>
/// <param name="morpho">Converts the analysis string into morphological features.</param>
/// <exception cref="System.ArgumentException">
/// If <paramref name="t"/> is not a preterminal, or either label is not a CoreLabel.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }
    CoreLabel tagLabel = t.Label() as CoreLabel;
    if (tagLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }
    CoreLabel wordLabel = t.Children()[0].Label() as CoreLabel;
    if (wordLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    // Morphological Analysis
    string analysis = wordLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        // No stored analysis: synthesize one from the tag and the POS subcategory.
        string baseTag = tagLabel.Value();
        string subCategory = wordLabel.Category();
        string suffix = string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
        analysis = baseTag + suffix;
    }

    MorphoFeatures features = morpho.StrToFeatures(analysis);
    string altTag = features.GetAltTag();
    if (!string.IsNullOrEmpty(altTag))
    {
        tagLabel.SetValue(altTag);
        tagLabel.SetTag(altTag);
    }
}
/// <summary>
/// Reads a tab-separated token file and groups the tokens of
/// <paramref name="novel"/> by the character id assigned to them in the file.
/// </summary>
/// <remarks>
/// The main output here is data/tokens/dickens.oliver.tokens, which contains the original book, one token per line,
/// with part of speech, syntax, NER, coreference and other annotations. The (tab-separated) format is:
/// Paragraph id
/// Sentence id
/// Token id
/// Byte start
/// Byte end
/// Whitespace following the token (useful for pretty-printing the original text)
/// Syntactic head id (-1 for the sentence root)
/// Original token
/// Normalized token (for quotes etc.)
/// Lemma
/// Penn Treebank POS tag
/// NER tag (PERSON, NUMBER, DATE, DURATION, MISC, TIME, LOCATION, ORDINAL, MONEY, ORGANIZATION, SET, O)
/// Stanford basic dependency label
/// Within-quotation flag
/// Character id (all coreferent tokens share the same character id)
/// The first line of the file is skipped. Only columns 2 (token id),
/// 7 (original token), 8 (normalized token) and 14 (character id) are read.
/// </remarks>
/// <param name="filename">Path of the token file to read.</param>
/// <param name="novel">Annotation whose TokensAnnotation list is indexed by the file's token ids.</param>
/// <returns>
/// Map from character id to the tokens of <paramref name="novel"/> carrying that id.
/// Tokens with character id -1, or whose text differs from the normalized token, are left out.
/// </returns>
public static IDictionary <int, IList <CoreLabel> > ReadTokenFile(string filename, Annotation novel)
{
    const string fourDots = ". . . .";
    IList <string> lines = IOUtils.LinesFromFile(filename);
    IDictionary <int, IList <CoreLabel> > charsToTokens = new Dictionary <int, IList <CoreLabel> >();
    bool first = true;
    // Number of extra tokens the novel has accumulated relative to the file so far.
    int tokenOffset = 0;
    foreach (string line in lines)
    {
        if (first)
        {
            // Skip the first line of the file.
            first = false;
            continue;
        }
        string[] pieces = line.Split('\t');
        int tokenId = System.Convert.ToInt32(pieces[2]) + tokenOffset;
        string token = pieces[7];
        string normalizedTok = pieces[8];
        int characterId = System.Convert.ToInt32(pieces[14]);
        CoreLabel novelTok = novel.Get(typeof(CoreAnnotations.TokensAnnotation))[tokenId];

        // CoreNLP sometimes splits ". . . ." as ". . ." and "." and sometimes lemmatizes it. (The Steppe)
        // When that happened, every later file token maps one position further into the novel.
        if (token.Equals(fourDots) && !novelTok.Get(typeof(CoreAnnotations.OriginalTextAnnotation)).Equals(fourDots))
        {
            tokenOffset++;
        }

        if (characterId == -1)
        {
            continue;
        }
        if (!novelTok.Get(typeof(CoreAnnotations.TextAnnotation)).Equals(normalizedTok))
        {
            // Report the mismatch and leave the token out.
            System.Console.Error.WriteLine(token + " != " + novelTok.Get(typeof(CoreAnnotations.TextAnnotation)));
            continue;
        }

        IList <CoreLabel> tokensForCharacter;
        if (!charsToTokens.TryGetValue(characterId, out tokensForCharacter))
        {
            tokensForCharacter = new List <CoreLabel>();
            charsToTokens[characterId] = tokensForCharacter;
        }
        tokensForCharacter.Add(novelTok);
    }
    return(charsToTokens);
}
/// <summary>
/// Tells whether the tree has at least one preterminal whose tag and word
/// both satisfy the given patterns.
/// </summary>
/// <param name="tree">Tree whose nodes are enumerated.</param>
/// <param name="pos">
/// Regular expression to match part of speech (may be null,
/// in which case any POS is allowed)
/// </param>
/// <param name="word">
/// Regular expression to match word (may be null, in which
/// case any word is allowed)
/// </param>
/// <returns>True as soon as one matching preterminal is found, false otherwise.</returns>
private static bool ShouldPrintTree(Tree tree, Pattern pos, Pattern word)
{
    foreach (Tree node in tree)
    {
        if (!node.IsPreTerminal())
        {
            continue;
        }
        string tagText = ((CoreLabel)node.Label()).Value();
        string wordText = ((CoreLabel)node.FirstChild().Label()).Value();

        // The word pattern is only consulted once the POS pattern has accepted.
        if (pos != null && !pos.Matcher(tagText).Find())
        {
            continue;
        }
        if (word != null && !word.Matcher(wordText).Find())
        {
            continue;
        }
        return(true);
    }
    return(false);
}
/// <summary>
/// Reads a small two-document text through an ObjectBankWrapper and checks
/// that the tokens and their "chris2" word shapes come out in the expected order.
/// </summary>
public virtual void TestUsingIterator()
{
    string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";
    string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
    string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
    NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");

    // Wire up reader -> object bank -> wrapper with word shapes enabled.
    SeqClassifierFlags flags = new SeqClassifierFlags(PropertiesUtils.AsProperties("wordShape", "chris2"));
    PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();
    readerAndWriter.Init(flags);
    ReaderIteratorFactory rif = new ReaderIteratorFactory(new StringReader(s));
    ObjectBank <IList <CoreLabel> > di = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
    ICollection <string> knownLCWords = new HashSet <string>();
    ObjectBankWrapper <CoreLabel> obw = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

    try
    {
        int position = 0;
        foreach (IList <CoreLabel> sentence in obw)
        {
            foreach (CoreLabel label in sentence)
            {
                string tok = label.Word();
                string shape = label.Get(typeof(CoreAnnotations.ShapeAnnotation));
                NUnit.Framework.Assert.AreEqual(output[position], tok);
                NUnit.Framework.Assert.AreEqual(outWSs[position], shape);
                position++;
            }
        }
        if (position < output.Length)
        {
            NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[position]);
        }
    }
    catch (Exception e)
    {
        // Running off the end of the expected arrays lands here.
        NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
    }
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Dashes in the word are surrounded with spaces and the result is split on
/// <c>pSpace</c>; one copy of <paramref name="cl"/> per part is appended to
/// <c>compoundBuffer</c>, with begin/end positions advanced by the length of
/// the parts already emitted.
/// </remarks>
/// <param name="cl">The compound token; its ParentAnnotation is removed.</param>
/// <returns>The result of removing the head of <c>compoundBuffer</c>.</returns>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string spaced = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
    string[] pieces = pSpace.Split(spaced);

    // Running offset of the next piece relative to the compound's start.
    int consumed = 0;
    for (int k = 0; k < pieces.Length; k++)
    {
        string piece = pieces[k];
        int pieceBegin = cl.BeginPosition() + consumed;
        CoreLabel copy = new CoreLabel(cl);
        copy.SetWord(piece);
        copy.SetValue(piece);
        copy.SetBeginPosition(pieceBegin);
        copy.SetEndPosition(pieceBegin + piece.Length);
        copy.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(copy);
        consumed += piece.Length;
    }
    return(compoundBuffer.Remove(0));
}
// end featuresCpCp2C
/// <summary>
/// Builds the features for the clique made of the current position and the
/// three positions before it.
/// </summary>
/// <remarks>
/// Nothing is produced unless <c>flags.use4Clique</c> is set and
/// <c>flags.maxLeft</c> is at least 3. A position with no UTypeAnnotation
/// contributes an empty string to the uType features.
/// </remarks>
/// <param name="cInfo">The sequence being featurized; indexed from loc - 3 to loc + 1.</param>
/// <param name="loc">Index of the current position in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were generated.</returns>
protected internal virtual ICollection <string> FeaturesCpCp2Cp3C <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel
{
    ICollection <string> features = new List <string>();
    if (!flags.use4Clique || flags.maxLeft < 3)
    {
        return(features);
    }

    CoreLabel c = cInfo[loc];
    CoreLabel c2 = cInfo[loc + 1];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    CoreLabel p3 = cInfo[loc - 3];

    string charc = c.GetString <CoreAnnotations.CharAnnotation>();
    string charp = p.GetString <CoreAnnotations.CharAnnotation>();
    string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>();
    string charp3 = p3.GetString <CoreAnnotations.CharAnnotation>();

    // The unicode type is nullable: with a plain int the null checks below
    // could never fail and a missing annotation could not map to "".
    int? cI = c.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string uTypec = (cI != null ? cI.ToString() : string.Empty);
    int? c2I = c2.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string uTypec2 = (c2I != null ? c2I.ToString() : string.Empty);
    int? pI = p.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string uTypep = (pI != null ? pI.ToString() : string.Empty);
    int? p2I = p2.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string uTypep2 = (p2I != null ? p2I.ToString() : string.Empty);
    int? p3I = p3.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string uTypep3 = (p3I != null ? p3I.ToString() : string.Empty);

    if (flags.useLongSequences)
    {
        features.Add(charp3 + charp2 + charp + charc + "p3p2pc");
    }
    if (flags.useUnicodeType4gram || flags.useUnicodeType5gram)
    {
        features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-uType4");
    }
    if (flags.useUnicodeType5gram)
    {
        features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType5");
    }
    features.Add("cliqueCpCp2Cp3C");
    return(features);
}
// public void getDecisionTree(Map<String, List<CoreLabel>> sents,
// List<Pair<String, Integer>> chosen, Counter<String> weights, String
// wekaOptions) {
// RVFDataset<String, String> dataset = new RVFDataset<String, String>();
// for (Pair<String, Integer> d : chosen) {
// CoreLabel l = sents.get(d.first).get(d.second());
// String w = l.word();
// Integer num = this.clusterIds.get(w);
// if (num == null)
// num = -1;
// double wt = weights.getCount("Cluster-" + num);
// String label;
// if (l.get(answerClass).toString().equals(answerLabel))
// label = answerLabel;
// else
// label = "O";
// Counter<String> feat = new ClassicCounter<String>();
// feat.setCount("DIST", wt);
// dataset.add(new RVFDatum<String, String>(feat, label));
// }
// WekaDatumClassifierFactory wekaFactory = new
// WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
// WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
// Classifier cls = classifier.getClassifier();
// J48 j48decisiontree = (J48) cls;
// System.out.println(j48decisiontree.toSummaryString());
// System.out.println(j48decisiontree.toString());
//
// }
/// <summary>
/// Selects tokens from <paramref name="sents"/> and adds a datum for each
/// selected token to <paramref name="dataset"/>.
/// </summary>
/// <remarks>
/// A token is selected when (checked in this order):
/// its <c>answerClass</c> value equals <c>answerLabel</c>; or it is labeled
/// something other than "O" or its lower-cased word is in <c>negativeWords</c>,
/// and a random draw with <paramref name="perSelectNeg"/> succeeds; or a
/// random draw with <paramref name="perSelectRand"/> succeeds (this last case
/// is the only one counted in the return value).
/// </remarks>
/// <param name="sents">Sentences keyed by sentence id.</param>
/// <param name="r">Random source used for both draws.</param>
/// <param name="rneg">Not used by this method.</param>
/// <param name="perSelectNeg">Passed to GetRandomBoolean for the negative-candidate draw.</param>
/// <param name="perSelectRand">Passed to GetRandomBoolean for the purely random draw.</param>
/// <param name="numrand">Running count of randomly selected tokens to start from.</param>
/// <param name="chosen">Receives a (sentence id, token index) pair per selected token.</param>
/// <param name="dataset">Receives the datum of each selected token.</param>
/// <returns><paramref name="numrand"/> plus the number of tokens selected by the purely random draw.</returns>
private int Sample(IDictionary <string, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, IList <Pair <string, int> > chosen, RVFDataset <string, string> dataset)
{
    foreach (KeyValuePair <string, DataInstance> en in sents)
    {
        CoreLabel[] sent = Sharpen.Collections.ToArray(en.Value.GetTokens(), new CoreLabel[0]);
        for (int i = 0; i < sent.Length; i++)
        {
            CoreLabel l = sent[i];
            bool chooseThis;
            if (l.Get(answerClass).Equals(answerLabel))
            {
                chooseThis = true;
            }
            else if ((!l.Get(answerClass).Equals("O") || negativeWords.Contains(l.Word().ToLower())) && GetRandomBoolean(r, perSelectNeg))
            {
                chooseThis = true;
            }
            else if (GetRandomBoolean(r, perSelectRand))
            {
                numrand++;
                chooseThis = true;
            }
            else
            {
                chooseThis = false;
            }

            if (chooseThis)
            {
                // Typed pair to match the element type of 'chosen'.
                chosen.Add(new Pair <string, int>(en.Key, i));
                RVFDatum <string, string> d = GetDatum(sent, i);
                // int has no static ToString(int); format the index directly.
                dataset.Add(d, en.Key, i.ToString());
            }
        }
    }
    return(numrand);
}
// static methods
/// <summary>
/// Sets the labels on the tree (except the leaves) to be the integer
/// value of the sentiment prediction.
/// </summary>
/// <remarks>
/// Children are relabeled before their parent. Makes it easy to print out
/// with Tree.toString()
/// </remarks>
/// <param name="tree">Root of the (sub)tree to relabel in place.</param>
/// <exception cref="System.ArgumentException">If a non-leaf node's label is not a CoreLabel.</exception>
private static void SetSentimentLabels(Tree tree)
{
    if (tree.IsLeaf())
    {
        return;
    }
    foreach (Tree child in tree.Children())
    {
        SetSentimentLabels(child);
    }
    CoreLabel cl = tree.Label() as CoreLabel;
    if (cl == null)
    {
        throw new ArgumentException("Required a tree with CoreLabels");
    }
    // int has no static ToString(int); call ToString on the predicted class itself.
    cl.SetValue(RNNCoreAnnotations.GetPredictedClass(tree).ToString());
}
/// <summary>
/// Lemmatizes <paramref name="arg"/> and writes the lemmas, separated by single
/// spaces and followed by a newline, to <paramref name="outStream"/> as UTF-8.
/// </summary>
/// <param name="arg">Text to lemmatize; nothing is written when null.</param>
/// <param name="outStream">Destination stream; flushed but not closed here.</param>
/// <exception cref="System.IO.IOException"/>
public virtual void HandleLemma(string arg, OutputStream outStream)
{
    if (arg == null)
    {
        return;
    }
    IList <CoreLabel> lemmatized = parser.Lemmatize(arg);
    OutputStreamWriter writer = new OutputStreamWriter(outStream, "utf-8");
    bool needSeparator = false;
    foreach (CoreLabel token in lemmatized)
    {
        if (needSeparator)
        {
            writer.Write(" ");
        }
        writer.Write(token.Lemma());
        needSeparator = true;
    }
    writer.Write("\n");
    writer.Flush();
}
/// <summary>Find the tree that covers the portion of interest.</summary>
/// <param name="root">Node to start from; its label and its children's labels must be CoreLabels.</param>
/// <param name="start">Begin index being looked for.</param>
/// <returns>The first node on the way down whose BeginIndexAnnotation equals <paramref name="start"/>.</returns>
/// <exception cref="System.Exception">If no child span contains <paramref name="start"/>.</exception>
private static Tree FindPartialSpan(Tree root, int start)
{
    int rootBegin = ((CoreLabel)root.Label()).Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    if (rootBegin == start)
    {
        return(root);
    }
    foreach (Tree kid in root.Children())
    {
        CoreLabel kidLabel = (CoreLabel)kid.Label();
        int kidBegin = kidLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        int kidEnd = kidLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));
        bool outside = kidBegin > start || kidEnd <= start;
        if (outside)
        {
            continue;
        }
        // Descend into the first child whose [begin, end) span contains start.
        return(FindPartialSpan(kid, start));
    }
    throw new Exception("Shouldn't happen: " + start + " " + root);
}
/// <summary>Find the index of the head of an entity.</summary>
/// <remarks>
/// If the mention already has a syntactic head position it is returned as is.
/// Otherwise the head is looked up with FindSyntacticHead; when that throws or
/// returns null, the last token of the extent is used instead.
/// </remarks>
/// <param name="ent">The entity mention</param>
/// <param name="tree">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <param name="setHeadSpan">Whether to set the head span in the entity mention.</param>
/// <returns>The index of the entity head</returns>
public virtual int AssignSyntacticHead(EntityMention ent, Tree tree, IList <CoreLabel> tokens, bool setHeadSpan)
{
    int knownHead = ent.GetSyntacticHeadTokenPosition();
    if (knownHead != -1)
    {
        return(knownHead);
    }

    logger.Finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.ToString());
    logger.Finest("Flat sentence is: " + tokens);

    Tree headNode = null;
    try
    {
        headNode = FindSyntacticHead(ent, tree, tokens);
    }
    catch (Exception e)
    {
        logger.Severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + SentenceToString(tokens));
        Sharpen.Runtime.PrintStackTrace(e);
    }

    // Right-most token of the extent; replaced below when a head was found.
    int headPos = ent.GetExtentTokenEnd() - 1;
    if (headNode == null)
    {
        logger.Fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
        logger.Fine("Fallback strategy: will set head to last token in mention: " + tokens[headPos]);
    }
    else
    {
        CoreLabel headLabel = (CoreLabel)headNode.Label();
        headPos = headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    }

    ent.SetHeadTokenPosition(headPos);
    if (setHeadSpan)
    {
        // set the head span to match exactly the syntactic head
        // this is needed for some corpora where the head span is not given
        ent.SetHeadTokenSpan(new Span(headPos, headPos + 1));
    }
    return(headPos);
}
/// <summary>
/// Segments <paramref name="line"/> and returns one CoreLabel per token span
/// found in the resulting IOB sequence.
/// </summary>
/// <param name="line">The text to segment.</param>
/// <returns>
/// Tokens carrying word, value, text, original text (the substring of
/// <paramref name="line"/> they cover) and character begin/end offsets.
/// </returns>
public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
{
    IList <CoreLabel> result = CollectionUtils.MakeList();
    IList <CoreLabel> iobSequence = SegmentStringToIOB(line);
    foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobSequence))
    {
        CoreLabel token = new CoreLabel();
        string surface = IOBUtils.IOBToString(iobSequence, prefixMarker, suffixMarker, span.GetSource(), span.GetTarget());
        token.SetWord(surface);
        token.SetValue(surface);
        token.Set(typeof(CoreAnnotations.TextAnnotation), surface);
        token.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

        // Character offsets: begin of the span's first label to end of its last
        // (the span target is used as an exclusive bound).
        int charBegin = iobSequence[span.GetSource()].BeginPosition();
        int charEnd = iobSequence[span.GetTarget() - 1].EndPosition();
        token.SetOriginalText(Sharpen.Runtime.Substring(line, charBegin, charEnd));
        token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charBegin);
        token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charEnd);
        result.Add(token);
    }
    return(result);
}
/// <summary>
/// Builds a new CoreLabel describing the wrapped parse node.
/// </summary>
/// <returns>
/// For a leaf: a label with word, begin/end position and value taken from the
/// parse. Otherwise: a label whose category and value are the parse type, and
/// whose tag is also set when <c>Depth()</c> is 1.
/// </returns>
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    CoreLabel built = new CoreLabel();
    if (!this.parse.IsLeaf)
    {
        built.SetCategory(this.parse.Type);
        built.SetValue(this.parse.Type);
        if (this.Depth() == 1)
        {
            built.SetTag(this.parse.Type);
        }
        return built;
    }

    built.SetWord(this.parse.Value);
    built.SetBeginPosition(this.parse.Span.Start);
    built.SetEndPosition(this.parse.Span.End);
    built.SetValue(this.parse.Value);
    return built;
}
/// <summary>This option also does not seem to help</summary>
/// <remarks>
/// Adds unary queue features for the labels at the left and right edge of
/// <paramref name="node"/> and for the positions just before and just after
/// it, each prefixed with <paramref name="nodeName"/>, a two-letter edge code
/// (EL, ER, EP, EN) and the node's value.
/// </remarks>
/// <param name="features">List the features are appended to.</param>
/// <param name="state">Parser state the queue labels are read from.</param>
/// <param name="nodeName">Prefix identifying the node in feature names.</param>
/// <param name="node">Node to describe; nothing is added when null.</param>
public virtual void AddEdgeFeatures2(IList <string> features, State state, string nodeName, Tree node)
{
    if (node == null)
    {
        return;
    }
    int leftEdge = ShiftReduceUtils.LeftIndex(node);
    int rightEdge = ShiftReduceUtils.RightIndex(node);
    string valueSuffix = GetFeatureFromCoreLabel(GetCoreLabel(node), FeatureFactory.FeatureComponent.Value) + "-";

    CoreLabel atLeft = GetQueueLabel(state, leftEdge);
    CoreLabel atRight = GetQueueLabel(state, rightEdge);
    AddUnaryQueueFeatures(features, atLeft, nodeName + "EL-" + valueSuffix);
    AddUnaryQueueFeatures(features, atRight, nodeName + "ER-" + valueSuffix);

    CoreLabel before = GetQueueLabel(state, leftEdge - 1);
    AddUnaryQueueFeatures(features, before, nodeName + "EP-" + valueSuffix);

    CoreLabel after = GetQueueLabel(state, rightEdge + 1);
    AddUnaryQueueFeatures(features, after, nodeName + "EN-" + valueSuffix);
}
/// <summary>
/// This is the original version of
/// <see cref="FindSyntacticHead(Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention, Edu.Stanford.Nlp.Trees.Tree, System.Collections.Generic.IList{E})"/>
/// before Chris's modifications.
/// There's no good reason to use it except for producing historical results.
/// It Finds the syntactic head of the given entity mention.
/// </summary>
/// <remarks>
/// If a subtree of <paramref name="root"/> spans the mention extent exactly,
/// its head is returned. Otherwise the extent tokens are parsed on their own
/// and the head of that local parse is mapped back to a node of <paramref name="root"/>.
/// </remarks>
/// <param name="ent">The entity mention</param>
/// <param name="root">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <returns>
/// The tree object corresponding to the head. This MUST be a child of root.
/// It will be a leaf in the parse tree.
/// </returns>
public virtual Tree OriginalFindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
{
    logger.Fine("Searching for tree matching " + ent);
    Tree spanning = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());

    //
    // found an exact match
    //
    if (spanning != null)
    {
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanning));
        return(SafeHead(spanning));
    }

    //
    // no exact match found
    // in this case, we parse the actual extent of the mention
    //
    IList <CoreLabel> extent = new List <CoreLabel>();
    int position = ent.GetExtentTokenStart();
    while (position < ent.GetExtentTokenEnd())
    {
        extent.Add(tokens[position]);
        position++;
    }

    Tree localParse = Parse(extent);
    logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
    ConvertToCoreLabels(localParse);
    localParse.IndexSpans(ent.GetExtentTokenStart());
    Tree localHead = SafeHead(localParse);
    System.Diagnostics.Debug.Assert((localHead != null));

    // extentHead is a child in the local extent parse tree. we need to find the
    // corresponding node in the main tree
    CoreLabel localHeadLabel = (CoreLabel)localHead.Label();
    Tree headInRoot = FindTreeWithSpan(root, localHeadLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), localHeadLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)));
    System.Diagnostics.Debug.Assert((headInRoot != null));
    return(headInRoot);
}
/// <summary>
/// Creates a dependency between two words given as plain strings; each is
/// wrapped in a new CoreLabel whose value and word are that string.
/// </summary>
/// <param name="regent">Text of the governing word.</param>
/// <param name="dependent">Text of the dependent word.</param>
/// <exception cref="System.ArgumentException">If either argument is null.</exception>
public UnnamedDependency(string regent, string dependent)
{
    bool anyMissing = regent == null || dependent == null;
    if (anyMissing)
    {
        throw new ArgumentException("governor or dependent cannot be null");
    }

    CoreLabel governorLabel = new CoreLabel();
    governorLabel.SetValue(regent);
    governorLabel.SetWord(regent);
    this._regent = governorLabel;

    CoreLabel dependentLabel = new CoreLabel();
    dependentLabel.SetValue(dependent);
    dependentLabel.SetWord(dependent);
    this._dependent = dependentLabel;

    RegentText = regent;
    DependentText = dependent;
}
/// <summary>
/// Finds the first leaf whose value equals <paramref name="token"/> and whose
/// begin index lies in [index, index + approximateness].
/// </summary>
/// <param name="root">Tree whose leaves are searched, in the order GetLeaves returns them.</param>
/// <param name="token">Leaf value to look for.</param>
/// <param name="index">Smallest acceptable begin index.</param>
/// <param name="approximateness">How far past <paramref name="index"/> the begin index may be.</param>
/// <returns>The matching leaf, or null (after logging) when there is none.</returns>
private Tree FunkyFindLeafWithApproximateSpan(Tree root, string token, int index, int approximateness)
{
    logger.Fine("Looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.PennString());
    IList <Tree> leaves = root.GetLeaves();
    foreach (Tree leaf in leaves)
    {
        // System.Type has no Cast method; use a direct cast.
        CoreLabel label = (CoreLabel)leaf.Label();
        int ind = label.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        // log.info("Token #" + ind + ": " + leaf.value());
        if (token.Equals(leaf.Value()) && ind >= index && ind <= index + approximateness)
        {
            return(leaf);
        }
    }
    // this shouldn't happen
    // but it does happen (VERY RARELY) on some weird web text that includes SGML tags with spaces
    // TODO: does this mean that somehow tokenization is different for the parser? check this by throwing an Exception in KBP
    logger.Severe("GenericDataSetReader: WARNING: Failed to find head token");
    logger.Severe(" when looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.PennString());
    return(null);
}
/// <summary>
/// Descends from <paramref name="current"/> to the first node whose
/// BeginIndexAnnotation equals <paramref name="start"/>.
/// </summary>
/// <param name="current">Node to start from; its label and its children's labels must be CoreLabels.</param>
/// <param name="start">Begin index being looked for.</param>
/// <returns>The node found.</returns>
/// <exception cref="System.Exception">If no child span contains <paramref name="start"/>.</exception>
private Tree FindPartialSpan(Tree current, int start)
{
    int currentBegin = ((CoreLabel)current.Label()).Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    if (currentBegin == start)
    {
        logger.Fine("findPartialSpan: Returning " + current);
        return(current);
    }
    foreach (Tree kid in current.Children())
    {
        CoreLabel kidLabel = (CoreLabel)kid.Label();
        int kidBegin = kidLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        int kidEnd = kidLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));
        // log.info("findPartialSpan: Examining " + kidLabel.value() + " from " + kidStart + " to " + kidEnd);
        bool outside = kidBegin > start || kidEnd <= start;
        if (outside)
        {
            continue;
        }
        // Recurse into the first child whose [begin, end) span contains start.
        return(FindPartialSpan(kid, start));
    }
    throw new Exception("Shouldn't happen: " + start + " " + current);
}
/// <summary>
/// Adds "wrapper" features to the base class's features for the current position.
/// </summary>
/// <remarks>
/// "Wrapper" feature: identity of first and last two chars of the current word.
/// This helps detect ma+_+sh in dialect, as well as avoiding segmenting possessive
/// pronouns if the word starts with al-.
/// Only words longer than three characters are considered.
/// </remarks>
/// <param name="cInfo">The sequence being featurized.</param>
/// <param name="loc">Index of the current position in <paramref name="cInfo"/>.</param>
/// <returns>The base features plus any wrapper features.</returns>
protected internal override ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
{
    ICollection <string> features = base.FeaturesCpC(cInfo, loc);
    CoreLabel c = cInfo[loc];
    if (c.Word().Length <= 3)
    {
        return(features);
    }

    string head = Sharpen.Runtime.Substring(c.Word(), 0, 2);
    string tail = Sharpen.Runtime.Substring(c.Word(), c.Word().Length - 2);
    string wrapper = head + "_" + tail;
    if (c.Index() == 2)
    {
        features.Add(wrapper + "-begin-wrap");
    }
    if (c.Index() == c.Word().Length - 1)
    {
        features.Add(wrapper + "-end-wrap");
    }
    return(features);
}
/// <summary>
/// Creates a CoreLabel for the wrapped parse node.
/// </summary>
/// <returns>
/// A leaf yields a label with word, begin/end position and value from the
/// parse; any other node yields a label with category and value set to the
/// parse type, plus the tag when <c>Depth()</c> is 1.
/// </returns>
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    CoreLabel label = new CoreLabel();
    bool isLeaf = this.parse.IsLeaf;
    if (isLeaf)
    {
        label.SetWord(this.parse.Value);
        label.SetBeginPosition(this.parse.Span.Start);
        label.SetEndPosition(this.parse.Span.End);
        label.SetValue(this.parse.Value);
        return(label);
    }

    label.SetCategory(this.parse.Type);
    label.SetValue(this.parse.Type);
    bool isPreTerminalDepth = this.Depth() == 1;
    if (isPreTerminalDepth)
    {
        label.SetTag(this.parse.Type);
    }
    return(label);
}
/// <summary>
/// Returns the next non-empty token, taking buffered pieces of previously
/// split tokens before reading from the lexer.
/// </summary>
/// <remarks>
/// When the corresponding option is on and the token is a CoreLabel whose
/// ParentAnnotation marks it as a compound or a contraction, the token is
/// replaced by the result of ProcessCompound / ProcessContraction.
/// </remarks>
/// <returns>The next token, or null when the lexer returns null.</returns>
protected internal override T GetNext()
{
    try
    {
        T nextToken = null;
        while (true)
        {
            // Depending on the orthographic normalization options,
            // some tokens can be obliterated. In this case, keep iterating
            // until we see a non-zero length token.
            bool takeFromBuffer = (splitContractions || splitCompounds) && compoundBuffer.Count > 0;
            nextToken = takeFromBuffer ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
            if (nextToken == null || nextToken.Word().Length != 0)
            {
                break;
            }
        }

        // Check for compounds to split
        if (splitCompounds && nextToken is CoreLabel)
        {
            CoreLabel compound = (CoreLabel)nextToken;
            if (compound.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)))
            {
                if (compound.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.CompoundAnnotation))
                {
                    nextToken = (T)ProcessCompound(compound);
                }
            }
        }

        // Check for contractions to split (on the possibly replaced token)
        if (splitContractions && nextToken is CoreLabel)
        {
            CoreLabel contraction = (CoreLabel)nextToken;
            if (contraction.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)))
            {
                if (contraction.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.ContrAnnotation))
                {
                    nextToken = (T)ProcessContraction(contraction);
                }
            }
        }
        return(nextToken);
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Handles verbs with attached suffixes, marked by the lexer:
/// Escribamosela => Escribamo + se + la => escribamos + se + la
/// Sentaos => senta + os => sentad + os
/// Damelo => da + me + lo
/// </summary>
/// <param name="cl">The verb token; its ParentAnnotation is removed.</param>
/// <returns>
/// <paramref name="cl"/> itself when no pronouns can be separated; otherwise a
/// copy holding the stem, with one copy per pronoun appended to <c>compoundBuffer</c>.
/// </returns>
private CoreLabel ProcessVerb(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    SpanishVerbStripper.StrippedVerb parts = verbStripper.SeparatePronouns(cl.Word());
    if (parts == null)
    {
        return(cl);
    }

    // Split the CoreLabel into separate labels, tracking changing begin + end
    // positions. Pronouns start where the original stem ends and follow each
    // other without gaps.
    int stemEnd = cl.BeginPosition() + parts.GetOriginalStem().Length;
    int pronounBegin = stemEnd;
    foreach (string pronoun in parts.GetPronouns())
    {
        compoundBuffer.Add(CopyCoreLabel(cl, pronoun, pronounBegin));
        pronounBegin += pronoun.Length;
    }

    CoreLabel stemLabel = CopyCoreLabel(cl, parts.GetStem(), cl.BeginPosition(), stemEnd);
    stemLabel.SetOriginalText(parts.GetOriginalStem());
    return(stemLabel);
}
/// <summary>
/// Writes the sentences and their dependency trees to <paramref name="outFile"/>,
/// one token per line with a blank line after each sentence.
/// </summary>
/// <remarks>
/// Each token line holds the 1-based token index, the word, the tag (twice),
/// the head index and the dependency label taken from the tree at the same
/// position in <paramref name="trees"/>; the remaining columns are "_".
/// </remarks>
/// <param name="outFile">Path of the file to write.</param>
/// <param name="sentences">Sentences whose TokensAnnotation is written.</param>
/// <param name="trees">Dependency trees, parallel to <paramref name="sentences"/>.</param>
/// <exception cref="RuntimeIOException">Wraps any exception raised while writing.</exception>
public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
{
    try
    {
        PrintWriter output = IOUtils.GetPrintWriter(outFile);
        try
        {
            for (int i = 0; i < sentences.Count; i++)
            {
                ICoreMap sentence = sentences[i];
                DependencyTree tree = trees[i];
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                // Token ids in the output are 1-based; 'size' was previously undeclared.
                int size = tokens.Count;
                for (int j = 1; j <= size; ++j)
                {
                    CoreLabel token = tokens[j - 1];
                    output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                }
                output.Println();
            }
        }
        finally
        {
            // Close the writer even when a sentence fails part-way through.
            output.Close();
        }
    }
    catch (Exception e)
    {
        throw new RuntimeIOException(e);
    }
}
// && !text.contains("+") &&
// !text.contains("*");// && !
// text.contains("$") && !text.contains("\"");
/// <summary>
/// Computes, for every token index of <paramref name="sent"/>, the set of
/// surface patterns around that token.
/// </summary>
/// <param name="sent">The sentence whose tokens are processed.</param>
/// <param name="stopWords">Passed to PatternFactory.DoNotUse and GetContext.</param>
/// <returns>
/// A map with one entry per token index: an empty set when
/// PatternFactory.DoNotUse rejects the token's word, otherwise the result of GetContext.
/// </returns>
public static IDictionary <int, ISet> GetPatternsAroundTokens(DataInstance sent, ICollection <CandidatePhrase> stopWords)
{
    IDictionary <int, ISet> p = new Dictionary <int, ISet>();
    IList <CoreLabel> tokens = sent.GetTokens();
    int position = 0;
    while (position < tokens.Count)
    {
        // p.put(
        // i,
        // new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
        // new HashSet<Integer>(), new HashSet<Integer>(),
        // new HashSet<Integer>()));
        // do not create patterns around stop words!
        bool skip = PatternFactory.DoNotUse(tokens[position].Word(), stopWords);
        if (skip)
        {
            p[position] = new HashSet <SurfacePattern>();
        }
        else
        {
            ICollection <SurfacePattern> pat = GetContext(sent.GetTokens(), position, stopWords);
            p[position] = pat;
        }
        position++;
    }
    return(p);
}
/// <summary>
/// Creates a Debinarizer by delegating to the two-argument constructor with
/// <c>CoreLabel.Factory()</c> as the second argument.
/// </summary>
/// <param name="forceCNF">Forwarded unchanged to the two-argument constructor.</param>
public Debinarizer(bool forceCNF) : this(forceCNF, CoreLabel.Factory()) { }
/// <summary>Tells whether the label has a lemma that is neither null nor empty.</summary>
/// <param name="l">The label to inspect.</param>
/// <returns>True when a non-empty lemma is present.</returns>
private static bool LemmaExists(CoreLabel l)
{
    string lemma = l.Lemma();
    return(!string.IsNullOrEmpty(lemma));
}
/// <summary>
/// Runs every pattern in <c>patterns</c> over every sentence in <c>sentids</c>
/// and collects the phrases matched by the "$term" group.
/// </summary>
/// <remarks>
/// As a side effect, every token inside a match gets its MatchedPattern
/// annotation set to true and the matching pattern added to its
/// MatchedPatterns annotation.
/// </remarks>
/// <returns>
/// A triple of: the per-(phrase, pattern) match counts; for each pattern the
/// (sentence id, start, inclusive end) of every accepted match; and the
/// phrases all of whose included words already carried <c>label</c>.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
{
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
    // CollectionValuedMap<String, Integer>();
    try
    {
        ICollection <CandidatePhrase> alreadyLabeledPhrases = new HashSet <CandidatePhrase>();
        TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>();
        CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
        foreach (string sentid in sentids)
        {
            IList <CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values makes the faster but uses more memory
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    // [s, e) is the token range of the "$term" group; e is exclusive.
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        // Extend the match leftwards over tokens that already carry the label.
                        // NOTE(review): s is only updated when a non-labeled token is found; if every
                        // token to the left is labeled the loop ends with s unchanged - confirm intended.
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        // Same extension to the right.
                        // NOTE(review): likewise e stays unchanged when every token to the right is labeled.
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // not needed as initialized false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        // Record on the token itself that it was matched, and by which pattern.
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        // A token carrying any (class, value) pair configured to be ignored for this label rejects the whole match.
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            // The token is included unless it is a stop word and stop words are being stripped.
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // Reject the match when a skipped token sits directly between two included tokens.
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        // The stored end index is inclusive (e - 1).
                        matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        // Log, then rethrow the same exception.
        logger.Error(e);
        throw;
    }
}
/// <summary>
/// Convenience overload that derives the end position from the length of
/// <paramref name="part"/> and delegates to the four-argument overload.
/// </summary>
/// <param name="cl">the label handed through to the four-argument overload</param>
/// <param name="part">the text piece; its length is added to <paramref name="beginPosition"/> to get the end position</param>
/// <param name="beginPosition">the begin position of the piece</param>
/// <returns>the result of the four-argument <c>CopyCoreLabel</c> overload</returns>
private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition)
{
    int endPosition = beginPosition + part.Length;
    return CopyCoreLabel(cl, part, beginPosition, endPosition);
}
/*/**
 * Simple tree reading utility method. Given a tree formatted as a PTB string, returns a Tree made by a specific TreeFactory.
 #1#
public static Tree readTree(string ptbTreeString, TreeFactory treeFactory) {
    try {
        PennTreeReader ptr = new PennTreeReader(new StringReader(ptbTreeString), treeFactory);
        return ptr.readTree();
    } catch (IOException ex) {
        throw new SystemException(ex);
    }
}*/

/**
 * Simple tree reading utility method. Given a tree formatted as a PTB string, returns a Tree made by the default TreeFactory (LabeledScoredTreeFactory)
 */
/*public static Tree readTree(string str) {
    return readTree(str, defaultTreeFactory);
}*/

/// <summary>
/// Walks the tree top-down and makes sure every node carries a CoreLabel:
/// any node whose label is not already a CoreLabel gets a fresh CoreLabel
/// holding the old label's value.
/// This is needed because additional info, like the token span, is stored in the CoreLabel.
/// </summary>
/// <param name="tree">root of the (sub)tree to convert in place</param>
public static void ConvertToCoreLabels(Tree tree)
{
    ILabel current = tree.Label();
    bool alreadyCoreLabel = current is CoreLabel;
    if (!alreadyCoreLabel)
    {
        // Only the value is carried over from the previous label.
        CoreLabel replacement = new CoreLabel();
        replacement.SetValue(current.Value());
        tree.SetLabel(replacement);
    }

    // Recurse into the children in order.
    Tree[] kids = tree.Children();
    for (int k = 0; k < kids.Length; k++)
    {
        ConvertToCoreLabels(kids[k]);
    }
}
// Attribute conversational mentions: give a quote the same mention as the quote two turns earlier.
// For each quote X (starting at the third quote), the mention of quote X-2 is copied to X when:
//   - X-2 already has a mention and that mention's type is not Sieve.Pronoun,
//   - X has no mention yet,
//   - X is alone in its paragraph (the tokens directly before and after it belong to other paragraphs),
//   - X-2, X-1 and X lie in consecutive paragraphs.
// If earlier quotes share a paragraph with X-2, the first quote of that run is used in place of X-2.
public override void DoQuoteToMention(Annotation doc)
{
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));

    for (int q = 2; q < quotes.Count; q++)
    {
        ICoreMap currQuote = quotes[q];
        ICoreMap prevQuote = quotes[q - 1];
        ICoreMap twoPrevQuote = quotes[q - 2];

        // Walk back to the first quote in the paragraph that holds quote q-2.
        int twoPrevPara = GetQuoteParagraph(twoPrevQuote);
        int back = q - 3;
        while (back >= 0 && GetQuoteParagraph(quotes[back]) == twoPrevPara)
        {
            twoPrevQuote = quotes[back];
            back--;
        }

        int quoteFirstToken = currQuote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int quoteLastToken = currQuote.Get(typeof(CoreAnnotations.TokenEndAnnotation));
        ICoreMap quoteFirstSentence = sentences[currQuote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
        bool aloneInParagraph = true;

        // Text directly before the quote in the same paragraph?
        if (quoteFirstToken > 0)
        {
            CoreLabel tokenBefore = tokens[quoteFirstToken - 1];
            ICoreMap sentenceBefore = sentences[tokenBefore.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
            if (sentenceBefore.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
            {
                aloneInParagraph = false;
            }
        }

        // Text directly after the quote in the same paragraph?
        if (quoteLastToken < tokens.Count - 1)
        {
            // If the next token is *NL*, it won't be in a sentence (if newlines have been tokenized),
            // so advance to the next non-*NL* token, stopping at the last token at the latest.
            int afterIdx = quoteLastToken + 1;
            CoreLabel tokenAfter = tokens[afterIdx];
            while (tokenAfter.IsNewline() && afterIdx < tokens.Count - 1)
            {
                afterIdx++;
                tokenAfter = tokens[afterIdx];
            }

            if (!tokenAfter.IsNewline())
            {
                ICoreMap sentenceAfter = sentences[tokenAfter.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
                if (sentenceAfter.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
                {
                    aloneInParagraph = false;
                }
            }
        }

        // Guard clauses, checked in the same order as a short-circuiting "||" chain.
        if (twoPrevQuote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) == null)
        {
            continue;
        }
        if (!aloneInParagraph)
        {
            continue;
        }
        if (currQuote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
        {
            continue;
        }
        if (twoPrevQuote.Get(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation)).Equals(Sieve.Pronoun))
        {
            continue;
        }

        // The three quotes must sit in three consecutive paragraphs.
        bool currFollowsPrev = GetQuoteParagraph(currQuote) == GetQuoteParagraph(prevQuote) + 1;
        if (currFollowsPrev && GetQuoteParagraph(prevQuote) == GetQuoteParagraph(twoPrevQuote) + 1)
        {
            FillInMention(currQuote, GetMentionData(twoPrevQuote), sieveName);
        }
    }
}
/// <summary>
/// Builds a <code>TreeGraphNode</code> that mirrors the structure and label
/// values of an existing tree (but shares no storage with it).
/// The whole subtree below <paramref name="t"/> is copied recursively.
/// </summary>
/// <param name="t">the tree to copy</param>
/// <param name="parent">the parent node of the node being created</param>
protected TreeGraphNode(Tree t, TreeGraphNode parent)
{
    _parent = parent;

    Tree[] sourceKids = t.Children();
    int kidCount = sourceKids.Length;
    _children = new TreeGraphNode[kidCount];

    for (int k = 0; k < kidCount; k++)
    {
        TreeGraphNode child = new TreeGraphNode(sourceKids[k], this);
        _children[k] = child;
        if (t.IsPreTerminal())
        {
            // add the tags to the leaves: the child's tag is this node's label value
            child._label.SetTag(t.Label().Value());
        }
    }

    // This node's own label is created after its children have been built.
    _label = (CoreLabel) Mlf.NewLabel(t.Label());
}
/// <summary>
/// Replaces the label associated with the current node.
/// </summary>
/// <param name="label">the label to store on this node (stored as-is, not copied)</param>
public void SetLabel(CoreLabel label) => _label = label;
/// <summary>
/// Returns the string form of this node's label in the requested output format.
/// </summary>
/// <param name="format">the output format passed to the label's <c>ToString</c></param>
/// <returns>the label's string representation for <paramref name="format"/></returns>
public string ToString(CoreLabel.OutputFormat format) => _label.ToString(format);
/// <summary>
/// Builds a <code>TreeGraphNode</code> from the supplied label.
/// The stored label is the one produced by <c>Mlf.NewLabel</c> for the argument, cast to CoreLabel.
/// </summary>
/// <param name="label">the label for this node</param>
public TreeGraphNode(ILabel label)
{
    var created = Mlf.NewLabel(label);
    _label = (CoreLabel) created;
}