Exemple #1
0
        //is EnglishPU
        /// <summary>
        /// Builds the character-context features for the position <paramref name="loc"/>.
        /// </summary>
        /// <remarks>
        /// Features are emitted only when <c>flags.useWord1</c> is set; otherwise the returned
        /// collection is empty. The features combine the CharAnnotation of the current position
        /// with those of the next two positions (loc + 1, loc + 2) and the previous two
        /// positions (loc - 1, loc - 2).
        /// </remarks>
        /// <param name="cInfo">The padded sequence of labels being featurized.</param>
        /// <param name="loc">Index of the current position in <paramref name="cInfo"/>.</param>
        /// <returns>The feature strings for this position (possibly empty, never null).</returns>
        public virtual ICollection <string> FeaturesC(PaddedList <IN> cInfo, int loc)
        {
            ICollection <string> features = new List <string>();

            if (!flags.useWord1)
            {
                // Nothing to compute: skip the neighbour lookups entirely.
                return(features);
            }
            CoreLabel c  = cInfo[loc];
            CoreLabel c1 = cInfo[loc + 1];
            CoreLabel c2 = cInfo[loc + 2];
            CoreLabel p  = cInfo[loc - 1];
            CoreLabel p2 = cInfo[loc - 2];
            string charc  = c.Get(typeof(CoreAnnotations.CharAnnotation));
            string charc1 = c1.Get(typeof(CoreAnnotations.CharAnnotation));
            string charc2 = c2.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp  = p.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

            // Unigram features.
            features.Add(charc + "::c");
            features.Add(charc1 + "::c1");
            features.Add(charp + "::p");
            features.Add(charp2 + "::p2");
            // trying to restore the features that Huishin described in SIGHAN 2005 paper
            features.Add(charc + charc1 + "::cn");
            features.Add(charp + charc + "::pc");
            features.Add(charp + charc1 + "::pn");
            features.Add(charp2 + charp + "::p2p");
            features.Add(charp2 + charc + "::p2c");
            features.Add(charc2 + charc + "::n2c");
            // Marker feature identifying this feature group.
            features.Add("|word1");
            return(features);
        }
Exemple #2
0
        /// <summary>
        /// Greedily scans the tokens of <paramref name="textRun"/> (inclusive token indices) for
        /// names stored in the node tree rooted at <c>rootNameNode</c>.
        /// </summary>
        /// <remarks>
        /// Note: this doesn't necessarily find all possible candidates, but is kind of a greedy version.
        /// E.g. "Elizabeth and Jane" will return only "Elizabeth and Jane", but not "Elizabeth", and "Jane" as well.
        /// A token that breaks a partial match resets the walk to the root but is not itself
        /// re-tested against the root's children.
        /// The scan is clamped to the document's token count, and the end-of-run match is reported
        /// relative to the last token actually scanned.
        /// </remarks>
        /// <param name="textRun">First and last token index (inclusive) of the run to scan.</param>
        /// <returns>
        /// A pair of parallel lists: the matched full names, and for each name its
        /// (start, end) token indices.
        /// </returns>
        public virtual Pair <List <string>, List <Pair <int, int> > > ScanForNamesNew(Pair <int, int> textRun)
        {
            List <string>           potentialNames = new List <string>();
            List <Pair <int, int> > nameIndices    = new List <Pair <int, int> >();
            IList <CoreLabel>       tokens         = doc.Get(typeof(CoreAnnotations.TokensAnnotation));

            // Never read past the end of the document, even if the run claims to extend further.
            int lastIndex = System.Math.Min(textRun.second, tokens.Count - 1);

            Sieve.TokenNode pointer = rootNameNode;
            for (int index = textRun.first; index <= lastIndex; index++)
            {
                CoreLabel token     = tokens[index];
                string    tokenText = token.Word();
                if (pointer.childNodes.Keys.Contains(tokenText))
                {
                    // Current token extends the partial match: descend one level.
                    pointer = pointer.childNodes[tokenText];
                }
                else
                {
                    if (!pointer.token.Equals("$ROOT"))
                    {
                        // The partial match ended on the previous token; record it if it is a complete name.
                        if (pointer.fullName != null)
                        {
                            potentialNames.Add(pointer.fullName);
                            nameIndices.Add(new Pair <int, int>(index - 1 - pointer.level, index - 1));
                        }
                        pointer = rootNameNode;
                    }
                }
            }
            // One past the last token that was actually scanned.
            int endIndex = lastIndex + 1;

            //catch the end case: a match still in progress when the run ended
            if (!pointer.token.Equals("$ROOT") && pointer.fullName != null)
            {
                potentialNames.Add(pointer.fullName);
                nameIndices.Add(new Pair <int, int>(endIndex - 1 - pointer.level, endIndex - 1));
            }
            return(new Pair <List <string>, List <Pair <int, int> > >(potentialNames, nameIndices));
        }
        /// <summary>
        /// Replaces the value and tag of a preterminal's label with the alternate tag produced by
        /// <paramref name="morpho"/> for that preterminal's morphological analysis string.
        /// </summary>
        /// <remarks>
        /// The analysis string is the child's original text; when that is null or empty it is
        /// rebuilt from the preterminal's value and the child's category. The label is left
        /// unchanged when the resulting alternate tag is null or empty.
        /// </remarks>
        /// <param name="t">A preterminal whose label and whose child's label are CoreLabels.</param>
        /// <param name="morpho">Converts the analysis string into morphological features.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="t"/> is not a preterminal, or one of the labels is not a CoreLabel.
        /// </exception>
        private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
        {
            if (!t.IsPreTerminal())
            {
                throw new ArgumentException("Can only operate on preterminals");
            }
            if (!(t.Label() is CoreLabel))
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }
            CoreLabel tagLabel = (CoreLabel)t.Label();
            Tree      wordNode = t.Children()[0];

            if (!(wordNode.Label() is CoreLabel))
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }
            CoreLabel wordLabel = (CoreLabel)wordNode.Label();

            // Morphological analysis, as stored on the word.
            string analysis = wordLabel.OriginalText();

            if (string.IsNullOrEmpty(analysis))
            {
                // No stored analysis: synthesize one from the tag and the POS subcategory.
                analysis = tagLabel.Value();
                string subCategory = wordLabel.Category();
                analysis += string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
            }
            string altTag = morpho.StrToFeatures(analysis).GetAltTag();

            if (string.IsNullOrEmpty(altTag))
            {
                return;
            }
            tagLabel.SetValue(altTag);
            tagLabel.SetTag(altTag);
        }
Exemple #4
0
        /// <summary>
        /// Reads a tab-separated token file and groups the tokens of <paramref name="novel"/> by
        /// the character id assigned to them in that file.
        /// </summary>
        /// <remarks>
        /// The main output here is data/tokens/dickens.oliver.tokens, which contains the original book, one token per line, with part of speech, syntax, NER, coreference and other annotations. The (tab-separated) format is:
        /// Paragraph id
        /// Sentence id
        /// Token id
        /// Byte start
        /// Byte end
        /// Whitespace following the token (useful for pretty-printing the original text)
        /// Syntactic head id (-1 for the sentence root)
        /// Original token
        /// Normalized token (for quotes etc.)
        /// Lemma
        /// Penn Treebank POS tag
        /// NER tag (PERSON, NUMBER, DATE, DURATION, MISC, TIME, LOCATION, ORDINAL, MONEY, ORGANIZATION, SET, O)
        /// Stanford basic dependency label
        /// Within-quotation flag
        /// Character id (all coreferent tokens share the same character id)
        /// The first line of the file is skipped (presumably a header row).
        /// Tokens with character id -1 are ignored; tokens whose normalized text differs from the
        /// corresponding token of <paramref name="novel"/> are reported on standard error and skipped.
        /// </remarks>
        /// <param name="filename">Path of the token file to read.</param>
        /// <param name="novel">Annotation whose TokensAnnotation list is indexed by the file's token ids.</param>
        /// <returns>A map from character id to the tokens of <paramref name="novel"/> assigned to that character.</returns>
        public static IDictionary <int, IList <CoreLabel> > ReadTokenFile(string filename, Annotation novel)
        {
            IList <string> lines = IOUtils.LinesFromFile(filename);
            IDictionary <int, IList <CoreLabel> > charsToTokens = new Dictionary <int, IList <CoreLabel> >();
            // The token list does not change while reading, so fetch it once.
            IList <CoreLabel> novelTokens = novel.Get(typeof(CoreAnnotations.TokensAnnotation));
            int tokenOffset = 0;

            // Start at 1: the first line is not a token row.
            for (int lineNo = 1; lineNo < lines.Count; lineNo++)
            {
                string[]  pieces        = lines[lineNo].Split('\t');
                int       tokenId       = System.Convert.ToInt32(pieces[2]) + tokenOffset;
                string    token         = pieces[7];
                string    normalizedTok = pieces[8];
                int       characterId   = System.Convert.ToInt32(pieces[14]);
                CoreLabel novelTok      = novelTokens[tokenId];
                // CoreNLP sometimes splits ". . . ." as ". . ." and "." and sometimes lemmatizes it. (The Steppe)
                // When that happens the annotation has one more token than the file, so shift later ids.
                if (token.Equals(". . . .") && !novelTok.Get(typeof(CoreAnnotations.OriginalTextAnnotation)).Equals(". . . ."))
                {
                    tokenOffset++;
                }
                if (characterId == -1)
                {
                    continue;
                }
                if (!novelTok.Get(typeof(CoreAnnotations.TextAnnotation)).Equals(normalizedTok))
                {
                    System.Console.Error.WriteLine(token + " != " + novelTok.Get(typeof(CoreAnnotations.TextAnnotation)));
                    continue;
                }
                IList <CoreLabel> tokensOfCharacter;
                if (!charsToTokens.TryGetValue(characterId, out tokensOfCharacter))
                {
                    tokensOfCharacter          = new List <CoreLabel>();
                    charsToTokens[characterId] = tokensOfCharacter;
                }
                tokensOfCharacter.Add(novelTok);
            }
            return(charsToTokens);
        }
 /// <summary>
 /// Reports whether the given tree contains a preterminal whose label value and
 /// whose child's label value satisfy the part-of-speech and lexical criteria.
 /// </summary>
 /// <param name="tree">The tree whose nodes are examined.</param>
 /// <param name="pos">
 /// Regular expression to match part of speech (may be null,
 /// in which case any POS is allowed)
 /// </param>
 /// <param name="word">
 /// Regular expression to match word (may be null, in which
 /// case any word is allowed)
 /// </param>
 /// <returns>true as soon as one preterminal satisfies both criteria; false if none does.</returns>
 private static bool ShouldPrintTree(Tree tree, Pattern pos, Pattern word)
 {
     foreach (Tree node in tree)
     {
         if (!node.IsPreTerminal())
         {
             continue;
         }
         string tagText  = ((CoreLabel)node.Label()).Value();
         string wordText = ((CoreLabel)node.FirstChild().Label()).Value();

         // A null pattern places no restriction on its field.
         if (pos != null && !pos.Matcher(tagText).Find())
         {
             continue;
         }
         if (word != null && !word.Matcher(wordText).Find())
         {
             continue;
         }
         return(true);
     }
     return(false);
 }
        /// <summary>
        /// Checks that iterating an ObjectBankWrapper over a small plain-text document yields the
        /// expected tokens and their "chris2" word shapes, in order.
        /// </summary>
        public virtual void TestUsingIterator()
        {
            string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";

            string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
            string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
            NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
            Properties         props = PropertiesUtils.AsProperties("wordShape", "chris2");
            SeqClassifierFlags flags = new SeqClassifierFlags(props);
            PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();

            readerAndWriter.Init(flags);
            ReaderIteratorFactory           rif          = new ReaderIteratorFactory(new StringReader(s));
            ObjectBank <IList <CoreLabel> > di           = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
            ICollection <string>            knownLCWords = new HashSet <string>();
            ObjectBankWrapper <CoreLabel>   obw          = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

            try
            {
                int outIdx = 0;
                foreach (IList <CoreLabel> sent in obw)
                {
                    foreach (CoreLabel cl in sent)
                    {
                        string tok   = cl.Word();
                        string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                        NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                        NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                        outIdx++;
                    }
                }
                if (outIdx < output.Length)
                {
                    NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
                }
            }
            catch (NUnit.Framework.AssertionException)
            {
                // Assertion failures are exceptions in NUnit; let them surface with their own
                // message instead of re-reporting them as "too many things" below.
                throw;
            }
            catch (Exception e)
            {
                // Anything else (e.g. indexing past the end of the expected arrays) most likely
                // means the iterator produced more items than expected.
                NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
            }
        }
Exemple #7
0
        /// <summary>
        /// Splits a compound marked by the lexer into its parts and queues one label per part
        /// in <c>compoundBuffer</c>.
        /// </summary>
        /// <remarks>
        /// Dashes in the word are surrounded with spaces and the result is split on
        /// <c>pSpace</c>. Each part gets a copy of <paramref name="cl"/> with its own word,
        /// value, original text and begin/end positions; positions advance by the length of
        /// the parts already emitted.
        /// </remarks>
        /// <param name="cl">The compound label; its ParentAnnotation is removed.</param>
        /// <returns>
        /// The result of <c>compoundBuffer.Remove(0)</c> (presumably the label at the head of
        /// the buffer — confirm against the buffer's type).
        /// </returns>
        private CoreLabel ProcessCompound(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            string   dashesSpaced = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
            string[] pieces       = pSpace.Split(dashesSpaced);
            int      consumed     = 0;

            for (int i = 0; i < pieces.Length; i++)
            {
                string    piece      = pieces[i];
                CoreLabel pieceLabel = new CoreLabel(cl);
                int       begin      = cl.BeginPosition() + consumed;
                pieceLabel.SetWord(piece);
                pieceLabel.SetValue(piece);
                pieceLabel.SetBeginPosition(begin);
                pieceLabel.SetEndPosition(begin + piece.Length);
                pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
                compoundBuffer.Add(pieceLabel);
                consumed += piece.Length;
            }
            return(compoundBuffer.Remove(0));
        }
Exemple #8
0
        // end featuresCpCp2C
        /// <summary>
        /// Builds the features that span the current position and the three positions before it.
        /// </summary>
        /// <remarks>
        /// Nothing is emitted unless <c>flags.use4Clique</c> is set and <c>flags.maxLeft</c> is at
        /// least 3. A position that has no UTypeAnnotation contributes an empty string to the
        /// unicode-type features.
        /// </remarks>
        /// <param name="cInfo">The padded sequence of labels being featurized.</param>
        /// <param name="loc">Index of the current position in <paramref name="cInfo"/>.</param>
        /// <returns>The feature strings for this clique (possibly empty, never null).</returns>
        protected internal virtual ICollection <string> FeaturesCpCp2Cp3C <_T0>(PaddedList <_T0> cInfo, int loc)
            where _T0 : CoreLabel
        {
            ICollection <string> features = new List <string>();

            if (flags.use4Clique && flags.maxLeft >= 3)
            {
                CoreLabel c       = cInfo[loc];
                CoreLabel c2      = cInfo[loc + 1];
                CoreLabel p       = cInfo[loc - 1];
                CoreLabel p2      = cInfo[loc - 2];
                CoreLabel p3      = cInfo[loc - 3];
                string    charc   = c.GetString <CoreAnnotations.CharAnnotation>();
                string    charp   = p.GetString <CoreAnnotations.CharAnnotation>();
                string    charp2  = p2.GetString <CoreAnnotations.CharAnnotation>();
                string    charp3  = p3.GetString <CoreAnnotations.CharAnnotation>();
                // Nullable so that a missing annotation is distinguishable from a real value;
                // with a plain int the null checks below would always be true.
                int?      cI      = c.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypec  = (cI.HasValue ? cI.Value.ToString() : string.Empty);
                int?      c2I     = c2.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypec2 = (c2I.HasValue ? c2I.Value.ToString() : string.Empty);
                int?      pI      = p.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypep  = (pI.HasValue ? pI.Value.ToString() : string.Empty);
                int?      p2I     = p2.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypep2 = (p2I.HasValue ? p2I.Value.ToString() : string.Empty);
                int?      p3I     = p3.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypep3 = (p3I.HasValue ? p3I.Value.ToString() : string.Empty);
                if (flags.useLongSequences)
                {
                    features.Add(charp3 + charp2 + charp + charc + "p3p2pc");
                }
                // The 5-gram option also turns on the 4-gram feature.
                if (flags.useUnicodeType4gram || flags.useUnicodeType5gram)
                {
                    features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-uType4");
                }
                if (flags.useUnicodeType5gram)
                {
                    features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType5");
                }
                // Marker feature identifying this clique.
                features.Add("cliqueCpCp2Cp3C");
            }
            return(features);
        }
 // public void getDecisionTree(Map<String, List<CoreLabel>> sents,
 // List<Pair<String, Integer>> chosen, Counter<String> weights, String
 // wekaOptions) {
 // RVFDataset<String, String> dataset = new RVFDataset<String, String>();
 // for (Pair<String, Integer> d : chosen) {
 // CoreLabel l = sents.get(d.first).get(d.second());
 // String w = l.word();
 // Integer num = this.clusterIds.get(w);
 // if (num == null)
 // num = -1;
 // double wt = weights.getCount("Cluster-" + num);
 // String label;
 // if (l.get(answerClass).toString().equals(answerLabel))
 // label = answerLabel;
 // else
 // label = "O";
 // Counter<String> feat = new ClassicCounter<String>();
 // feat.setCount("DIST", wt);
 // dataset.add(new RVFDatum<String, String>(feat, label));
 // }
 // WekaDatumClassifierFactory wekaFactory = new
 // WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
 // WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
 // Classifier cls = classifier.getClassifier();
 // J48 j48decisiontree = (J48) cls;
 // System.out.println(j48decisiontree.toSummaryString());
 // System.out.println(j48decisiontree.toString());
 //
 // }
 /// <summary>
 /// Walks every token of every sentence, decides which tokens to sample, and adds a datum
 /// for each sampled token to <paramref name="dataset"/>.
 /// </summary>
 /// <remarks>
 /// A token is sampled when its <c>answerClass</c> annotation equals <c>answerLabel</c>.
 /// Otherwise, if its annotation is not "O" or its lower-cased word is in <c>negativeWords</c>,
 /// it is sampled when <c>GetRandomBoolean(r, perSelectNeg)</c> succeeds. Failing that, it is
 /// sampled (and counted as a random pick) when <c>GetRandomBoolean(r, perSelectRand)</c> succeeds.
 /// </remarks>
 /// <param name="sents">Sentences keyed by sentence id.</param>
 /// <param name="r">Random source used for both sampling decisions.</param>
 /// <param name="rneg">Not used by this method.</param>
 /// <param name="perSelectNeg">Passed to <c>GetRandomBoolean</c> for non-"O" or negative-word tokens.</param>
 /// <param name="perSelectRand">Passed to <c>GetRandomBoolean</c> for the remaining tokens.</param>
 /// <param name="numrand">Starting count of randomly picked tokens.</param>
 /// <param name="chosen">Receives a (sentence id, token index) pair for every sampled token.</param>
 /// <param name="dataset">Receives the datum of every sampled token.</param>
 /// <returns><paramref name="numrand"/> plus the number of tokens picked by the random fallback.</returns>
 private int Sample(IDictionary <string, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, IList <Pair <string, int> > chosen, RVFDataset <string, string> dataset)
 {
     foreach (KeyValuePair <string, DataInstance> en in sents)
     {
         CoreLabel[] sent = Sharpen.Collections.ToArray(en.Value.GetTokens(), new CoreLabel[0]);
         for (int i = 0; i < sent.Length; i++)
         {
             CoreLabel l = sent[i];
             bool      chooseThis;
             // The order of the branches matters: each GetRandomBoolean call draws from r.
             if (l.Get(answerClass).Equals(answerLabel))
             {
                 chooseThis = true;
             }
             else if ((!l.Get(answerClass).Equals("O") || negativeWords.Contains(l.Word().ToLower())) && GetRandomBoolean(r, perSelectNeg))
             {
                 chooseThis = true;
             }
             else if (GetRandomBoolean(r, perSelectRand))
             {
                 numrand++;
                 chooseThis = true;
             }
             else
             {
                 chooseThis = false;
             }
             if (chooseThis)
             {
                 chosen.Add(new Pair <string, int>(en.Key, i));
                 RVFDatum <string, string> d = GetDatum(sent, i);
                 dataset.Add(d, en.Key, i.ToString(System.Globalization.CultureInfo.InvariantCulture));
             }
         }
     }
     return(numrand);
 }
Exemple #10
0
        // static methods
        /// <summary>
        /// Sets the labels on the tree (except the leaves) to be the integer
        /// value of the sentiment prediction.
        /// </summary>
        /// <remarks>
        /// Makes it easy to print out with Tree.toString(). Children are relabelled before
        /// their parent; leaves are left untouched.
        /// </remarks>
        /// <param name="tree">The tree to relabel in place; every non-leaf label must be a CoreLabel.</param>
        /// <exception cref="ArgumentException">A non-leaf node carries a label that is not a CoreLabel.</exception>
        private static void SetSentimentLabels(Tree tree)
        {
            if (tree.IsLeaf())
            {
                return;
            }
            foreach (Tree child in tree.Children())
            {
                SetSentimentLabels(child);
            }
            ILabel label = tree.Label();

            if (!(label is CoreLabel))
            {
                throw new ArgumentException("Required a tree with CoreLabels");
            }
            CoreLabel cl = (CoreLabel)label;

            // Culture-invariant formatting keeps the label a plain integer in every locale.
            cl.SetValue(RNNCoreAnnotations.GetPredictedClass(tree).ToString(System.Globalization.CultureInfo.InvariantCulture));
        }
        /// <summary>
        /// Lemmatizes <paramref name="arg"/> and writes the lemmas to <paramref name="outStream"/>
        /// on one line, separated by single spaces and followed by a newline.
        /// </summary>
        /// <remarks>
        /// Does nothing when <paramref name="arg"/> is null. The writer is created with the
        /// "utf-8" charset name and is flushed but not closed.
        /// </remarks>
        /// <param name="arg">The text to lemmatize, or null.</param>
        /// <param name="outStream">The stream that receives the output.</param>
        /// <exception cref="System.IO.IOException"/>
        public virtual void HandleLemma(string arg, OutputStream outStream)
        {
            if (arg == null)
            {
                return;
            }
            IList <CoreLabel>  lemmatized = parser.Lemmatize(arg);
            OutputStreamWriter writer     = new OutputStreamWriter(outStream, "utf-8");
            bool               needsSpace = false;

            foreach (CoreLabel tok in lemmatized)
            {
                if (needsSpace)
                {
                    writer.Write(" ");
                }
                writer.Write(tok.Lemma());
                needsSpace = true;
            }
            writer.Write("\n");
            writer.Flush();
        }
Exemple #12
0
        /// <summary>
        /// Descends from <paramref name="root"/> to the first node whose begin index equals
        /// <paramref name="start"/>.
        /// </summary>
        /// <remarks>
        /// At each level the walk continues into the first child whose
        /// [begin index, end index) range contains <paramref name="start"/>.
        /// </remarks>
        /// <param name="root">The node to start from; labels must be CoreLabels.</param>
        /// <param name="start">The begin index to look for.</param>
        /// <returns>The node whose begin index equals <paramref name="start"/>.</returns>
        /// <exception cref="Exception">No child of the current node covers <paramref name="start"/>.</exception>
        private static Tree FindPartialSpan(Tree root, int start)
        {
            Tree node = root;

            while (true)
            {
                CoreLabel nodeLabel = (CoreLabel)node.Label();
                int       nodeBegin = nodeLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                if (nodeBegin == start)
                {
                    return(node);
                }
                Tree covering = null;
                foreach (Tree kid in node.Children())
                {
                    CoreLabel kidLabel = (CoreLabel)kid.Label();
                    int       kidBegin = kidLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                    int       kidEnd   = kidLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));
                    if (kidBegin <= start && start < kidEnd)
                    {
                        covering = kid;
                        break;
                    }
                }
                if (covering == null)
                {
                    throw new Exception("Shouldn't happen: " + start + " " + node);
                }
                node = covering;
            }
        }
Exemple #13
0
        /// <summary>Determines the head token index of an entity mention and stores it on the mention.</summary>
        /// <remarks>
        /// If the mention already has a syntactic head position (anything other than -1) that
        /// position is returned unchanged. Otherwise the head is looked up with
        /// <c>FindSyntacticHead</c>; if that throws or returns null, the last token of the
        /// mention's extent is used instead.
        /// </remarks>
        /// <param name="ent">The entity mention</param>
        /// <param name="tree">The Tree for the entire sentence in which it occurs.</param>
        /// <param name="tokens">The Sentence in which it occurs</param>
        /// <param name="setHeadSpan">Whether to set the head span in the entity mention.</param>
        /// <returns>The index of the entity head</returns>
        public virtual int AssignSyntacticHead(EntityMention ent, Tree tree, IList <CoreLabel> tokens, bool setHeadSpan)
        {
            int knownHead = ent.GetSyntacticHeadTokenPosition();

            if (knownHead != -1)
            {
                return(knownHead);
            }
            logger.Finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.ToString());
            logger.Finest("Flat sentence is: " + tokens);

            Tree headNode = null;
            try
            {
                headNode = FindSyntacticHead(ent, tree, tokens);
            }
            catch (Exception e)
            {
                // Best effort: log the failure and fall through to the right-most heuristic.
                logger.Severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + SentenceToString(tokens));
                Sharpen.Runtime.PrintStackTrace(e);
            }

            // Fallback position: the last token of the mention's extent.
            int headPos = ent.GetExtentTokenEnd() - 1;

            if (headNode == null)
            {
                logger.Fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
                logger.Fine("Fallback strategy: will set head to last token in mention: " + tokens[headPos]);
            }
            else
            {
                CoreLabel headLabel = (CoreLabel)headNode.Label();
                headPos = headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
            }
            ent.SetHeadTokenPosition(headPos);
            if (setHeadSpan)
            {
                // set the head span to match exactly the syntactic head
                // this is needed for some corpora where the head span is not given
                ent.SetHeadTokenSpan(new Span(headPos, headPos + 1));
            }
            return(headPos);
        }
Exemple #14
0
        /// <summary>
        /// Segments <paramref name="line"/> and returns one CoreLabel per token span found in
        /// the IOB-labelled sequence.
        /// </summary>
        /// <remarks>
        /// Each token's word, value and TextAnnotation are the string built by
        /// <c>IOBUtils.IOBToString</c> for its span. Its original text is the substring of
        /// <paramref name="line"/> between the begin position of the span's first element and
        /// the end position of its last element, and those two positions are also stored as the
        /// character offset annotations.
        /// </remarks>
        /// <param name="line">The text to segment.</param>
        /// <returns>The tokens, in the order their spans are produced.</returns>
        public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
        {
            IList <CoreLabel> result    = CollectionUtils.MakeList();
            IList <CoreLabel> iobLabels = SegmentStringToIOB(line);

            foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobLabels))
            {
                string surface = IOBUtils.IOBToString(iobLabels, prefixMarker, suffixMarker, span.GetSource(), span.GetTarget());

                CoreLabel tokenLabel = new CoreLabel();
                tokenLabel.SetWord(surface);
                tokenLabel.SetValue(surface);
                tokenLabel.Set(typeof(CoreAnnotations.TextAnnotation), surface);
                tokenLabel.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

                // The span's target is exclusive, hence the - 1 for the last element.
                int charBegin = iobLabels[span.GetSource()].BeginPosition();
                int charEnd   = iobLabels[span.GetTarget() - 1].EndPosition();
                tokenLabel.SetOriginalText(Sharpen.Runtime.Substring(line, charBegin, charEnd));
                tokenLabel.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charBegin);
                tokenLabel.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charEnd);

                result.Add(tokenLabel);
            }
            return(result);
        }
Exemple #15
0
 /// <summary>Creates a new CoreLabel describing this node's underlying parse.</summary>
 /// <remarks>
 /// For a leaf, the label carries the parse value as word and value plus the span's start
 /// and end as begin and end positions. For any other node, it carries the parse type as
 /// category and value, and also as tag when <c>Depth()</c> is 1.
 /// </remarks>
 /// <returns>A freshly built label; a new instance is created on every call.</returns>
 public override ILabel Label()
 {
     // TODO: move this CoreLabel construction logic somewhere appropriate
     var label = new CoreLabel();

     if (!this.parse.IsLeaf)
     {
         var nodeType = this.parse.Type;
         label.SetCategory(nodeType);
         label.SetValue(nodeType);
         if (this.Depth() == 1)
         {
             label.SetTag(nodeType);
         }
         return label;
     }

     var text = this.parse.Value;
     label.SetWord(text);
     label.SetBeginPosition(this.parse.Span.Start);
     label.SetEndPosition(this.parse.Span.End);
     label.SetValue(text);
     return label;
 }
        /// <summary>
        /// Adds unary queue features for the queue labels at and just outside the left and right
        /// edges of <paramref name="node"/>. This option also does not seem to help.
        /// </summary>
        /// <remarks>
        /// Four groups are added, in this order: left edge ("EL-"), right edge ("ER-"), the
        /// position before the left edge ("EP-") and the position after the right edge ("EN-").
        /// Each prefix is <paramref name="nodeName"/>, the group tag, the node's Value feature
        /// and a trailing "-". Nothing is added when <paramref name="node"/> is null.
        /// </remarks>
        /// <param name="features">The list the features are appended to.</param>
        /// <param name="state">The parser state the queue labels are read from.</param>
        /// <param name="nodeName">Name used as the start of every feature prefix.</param>
        /// <param name="node">The tree node whose edges are examined; may be null.</param>
        public virtual void AddEdgeFeatures2(IList <string> features, State state, string nodeName, Tree node)
        {
            if (node == null)
            {
                return;
            }
            int    leftEdge    = ShiftReduceUtils.LeftIndex(node);
            int    rightEdge   = ShiftReduceUtils.RightIndex(node);
            string valueSuffix = GetFeatureFromCoreLabel(GetCoreLabel(node), FeatureFactory.FeatureComponent.Value) + "-";

            CoreLabel atLeft  = GetQueueLabel(state, leftEdge);
            CoreLabel atRight = GetQueueLabel(state, rightEdge);
            AddUnaryQueueFeatures(features, atLeft, nodeName + "EL-" + valueSuffix);
            AddUnaryQueueFeatures(features, atRight, nodeName + "ER-" + valueSuffix);

            CoreLabel beforeLeft = GetQueueLabel(state, leftEdge - 1);
            AddUnaryQueueFeatures(features, beforeLeft, nodeName + "EP-" + valueSuffix);

            CoreLabel afterRight = GetQueueLabel(state, rightEdge + 1);
            AddUnaryQueueFeatures(features, afterRight, nodeName + "EN-" + valueSuffix);
        }
Exemple #17
0
        /// <summary>
        /// This is the original version of
        /// <see cref="FindSyntacticHead(Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention, Edu.Stanford.Nlp.Trees.Tree, System.Collections.Generic.IList{E})"/>
        /// before Chris's modifications.
        /// There's no good reason to use it except for producing historical results.
        /// It finds the syntactic head of the given entity mention.
        /// </summary>
        /// <remarks>
        /// If a subtree of <paramref name="root"/> spans exactly the mention's extent, the head of
        /// that subtree is returned. Otherwise the extent's tokens are parsed on their own, the
        /// head of that local parse is located, and the node of <paramref name="root"/> with the
        /// same begin and end index is returned.
        /// </remarks>
        /// <param name="ent">The entity mention</param>
        /// <param name="root">The Tree for the entire sentence in which it occurs.</param>
        /// <param name="tokens">The Sentence in which it occurs</param>
        /// <returns>
        /// The tree object corresponding to the head: either the head of the exactly matching
        /// subtree, or the node of <paramref name="root"/> found for the local head's span.
        /// </returns>
        public virtual Tree OriginalFindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            logger.Fine("Searching for tree matching " + ent);
            Tree spanningTree = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());

            if (spanningTree != null)
            {
                // Found a subtree covering exactly the mention's extent.
                logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanningTree));
                return(SafeHead(spanningTree));
            }

            // No exact match: parse just the tokens of the mention's extent.
            IList <CoreLabel> mentionTokens = new List <CoreLabel>();
            for (int pos = ent.GetExtentTokenStart(); pos < ent.GetExtentTokenEnd(); pos++)
            {
                mentionTokens.Add(tokens[pos]);
            }
            Tree localParse = Parse(mentionTokens);
            logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
            ConvertToCoreLabels(localParse);
            // Index the local parse starting at the extent's first token.
            localParse.IndexSpans(ent.GetExtentTokenStart());

            Tree localHead = SafeHead(localParse);
            System.Diagnostics.Debug.Assert((localHead != null));

            // localHead belongs to the local extent parse; look up the node of the
            // main tree that has the same span.
            CoreLabel localHeadLabel = (CoreLabel)localHead.Label();
            Tree      headInRoot     = FindTreeWithSpan(root, localHeadLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), localHeadLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)));
            System.Diagnostics.Debug.Assert((headInRoot != null));
            return(headInRoot);
        }
Exemple #18
0
        /// <summary>
        /// Creates a dependency between two words given as plain strings; each string is wrapped
        /// in a new CoreLabel whose value and word are both set to it.
        /// </summary>
        /// <param name="regent">Text of the governing word; must not be null.</param>
        /// <param name="dependent">Text of the dependent word; must not be null.</param>
        /// <exception cref="ArgumentException">Either argument is null.</exception>
        public UnnamedDependency(string regent, string dependent)
        {
            if (regent == null || dependent == null)
            {
                throw new ArgumentException("governor or dependent cannot be null");
            }

            var governorLabel = new CoreLabel();
            governorLabel.SetValue(regent);
            governorLabel.SetWord(regent);

            var dependentLabel = new CoreLabel();
            dependentLabel.SetValue(dependent);
            dependentLabel.SetWord(dependent);

            this._regent    = governorLabel;
            this._dependent = dependentLabel;

            // The raw strings are kept alongside the labels.
            RegentText    = regent;
            DependentText = dependent;
        }
Exemple #19
0
        /// <summary>
        /// Finds the first leaf of <paramref name="root"/> whose value equals
        /// <paramref name="token"/> and whose begin index lies between <paramref name="index"/>
        /// and <paramref name="index"/> + <paramref name="approximateness"/> (both inclusive).
        /// </summary>
        /// <param name="root">The tree whose leaves are searched; leaf labels must be CoreLabels.</param>
        /// <param name="token">The leaf value to look for.</param>
        /// <param name="index">The lowest acceptable begin index.</param>
        /// <param name="approximateness">How far above <paramref name="index"/> the begin index may be.</param>
        /// <returns>The matching leaf, or null (after logging the failure) when there is none.</returns>
        private Tree FunkyFindLeafWithApproximateSpan(Tree root, string token, int index, int approximateness)
        {
            // Shared tail of the start and failure log messages; built once.
            string searchDescription = token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.PennString();

            logger.Fine("Looking for " + searchDescription);
            IList <Tree> leaves = root.GetLeaves();

            foreach (Tree leaf in leaves)
            {
                CoreLabel label = (CoreLabel)leaf.Label();
                int       ind   = label.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                // log.info("Token #" + ind + ": " + leaf.value());
                if (token.Equals(leaf.Value()) && ind >= index && ind <= index + approximateness)
                {
                    return(leaf);
                }
            }
            // this shouldn't happen
            // but it does happen (VERY RARELY) on some weird web text that includes SGML tags with spaces
            // TODO: does this mean that somehow tokenization is different for the parser? check this by throwing an Exception in KBP
            logger.Severe("GenericDataSetReader: WARNING: Failed to find head token");
            logger.Severe("  when looking for " + searchDescription);
            return(null);
        }
Exemple #20
0
        /// <summary>
        /// Finds, at or below <paramref name="current"/>, the first node whose begin index equals
        /// <paramref name="start"/>.
        /// </summary>
        /// <remarks>
        /// The search recurses into the first child whose [begin index, end index) range
        /// contains <paramref name="start"/>. The node that is returned is logged at Fine level.
        /// </remarks>
        /// <param name="current">The node to search from; labels must be CoreLabels.</param>
        /// <param name="start">The begin index to look for.</param>
        /// <returns>The node whose begin index equals <paramref name="start"/>.</returns>
        /// <exception cref="Exception">No child of the examined node covers <paramref name="start"/>.</exception>
        private Tree FindPartialSpan(Tree current, int start)
        {
            CoreLabel currentLabel = (CoreLabel)current.Label();
            int       currentBegin = currentLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));

            if (currentBegin != start)
            {
                foreach (Tree kid in current.Children())
                {
                    CoreLabel kidLabel = (CoreLabel)kid.Label();
                    int       kidBegin = kidLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                    int       kidEnd   = kidLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));
                    bool      outside  = kidBegin > start || kidEnd <= start;
                    if (!outside)
                    {
                        return(FindPartialSpan(kid, start));
                    }
                }
                throw new Exception("Shouldn't happen: " + start + " " + current);
            }
            logger.Fine("findPartialSpan: Returning " + current);
            return(current);
        }
        /// <summary>
        /// Extends the base features with a "wrapper" feature built from the first two and last two
        /// characters of the current word. Per the original author, this helps detect ma+_+sh in
        /// dialect and avoids segmenting possessive pronouns when the word starts with al-.
        /// </summary>
        /// <param name="cInfo">padded token list</param>
        /// <param name="loc">position of the current token in <paramref name="cInfo"/></param>
        /// <returns>the base feature collection, possibly with wrapper features appended</returns>
        protected internal override ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
        {
            ICollection <string> features = base.FeaturesCpC(cInfo, loc);
            CoreLabel            current  = cInfo[loc];
            string               word     = current.Word();

            // Only words longer than three characters get wrapper features.
            if (word.Length <= 3)
            {
                return features;
            }

            string wrapper = Sharpen.Runtime.Substring(word, 0, 2) + "_" + Sharpen.Runtime.Substring(word, word.Length - 2);

            if (current.Index() == 2)
            {
                features.Add(wrapper + "-begin-wrap");
            }
            if (current.Index() == word.Length - 1)
            {
                features.Add(wrapper + "-end-wrap");
            }
            return features;
        }
Exemple #22
0
        /// <summary>
        /// Builds a new <c>CoreLabel</c> describing the wrapped parse node.
        /// Leaves get word, begin/end position and value; other nodes get category and value,
        /// plus a tag when <c>Depth()</c> is 1.
        /// </summary>
        /// <returns>a freshly constructed label; a new instance is created on every call</returns>
        public override ILabel Label()
        {
            // TODO: move this CoreLabel construction logic somewhere appropriate
            var result = new CoreLabel();

            if (!this.parse.IsLeaf)
            {
                var nodeType = this.parse.Type;
                result.SetCategory(nodeType);
                result.SetValue(nodeType);
                if (this.Depth() == 1)
                {
                    result.SetTag(nodeType);
                }
                return result;
            }

            var text = this.parse.Value;
            result.SetWord(text);
            result.SetBeginPosition(this.parse.Span.Start);
            result.SetEndPosition(this.parse.Span.End);
            result.SetValue(text);
            return result;
        }
Exemple #23
0
 /// <summary>
 /// Produces the next token, taking it from <c>compoundBuffer</c> when splitting is enabled and the
 /// buffer is non-empty, otherwise from the lexer. Tokens whose word is empty are skipped.
 /// Tokens marked as compounds or contractions are passed to the matching processing routine.
 /// </summary>
 /// <returns>the next token, or null when the source yields null</returns>
 /// <exception cref="RuntimeIOException">wraps an <c>IOException</c> raised while reading</exception>
 protected internal override T GetNext()
 {
     try
     {
         T token;
         while (true)
         {
             // Depending on the orthographic normalization options, some tokens can be
             // obliterated, so keep pulling until a token with a non-empty word shows up.
             bool drainBuffer = (splitContractions || splitCompounds) && compoundBuffer.Count > 0;
             if (drainBuffer)
             {
                 token = (T)compoundBuffer.Remove(0);
             }
             else
             {
                 token = (T)lexer.Next();
             }

             if (token == null || token.Word().Length != 0)
             {
                 break;
             }
         }

         // Check for compounds to split
         if (splitCompounds && token is CoreLabel)
         {
             CoreLabel compoundCandidate = (CoreLabel)token;
             bool      markedCompound    = compoundCandidate.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)) &&
                                           compoundCandidate.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.CompoundAnnotation);
             if (markedCompound)
             {
                 token = (T)ProcessCompound(compoundCandidate);
             }
         }

         // Check for contractions to split
         if (splitContractions && token is CoreLabel)
         {
             CoreLabel contractionCandidate = (CoreLabel)token;
             bool      markedContraction    = contractionCandidate.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)) &&
                                              contractionCandidate.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.ContrAnnotation);
             if (markedContraction)
             {
                 token = (T)ProcessContraction(contractionCandidate);
             }
         }

         return token;
     }
     catch (IOException e)
     {
         throw new RuntimeIOException(e);
     }
 }
Exemple #24
0
        /// <summary>
        /// Splits a verb with attached pronoun suffixes (as marked by the lexer) into a stem label
        /// and one label per pronoun. Examples:
        /// Escribamosela =&gt; Escribamo + se + la =&gt; escribamos + se + la
        /// Sentaos =&gt; senta + os =&gt; sentad + os
        /// Damelo =&gt; da + me + lo
        /// </summary>
        /// <param name="cl">label of the verb token; its ParentAnnotation is removed</param>
        /// <returns>
        /// the stem label, or <paramref name="cl"/> itself when the stripper returns null;
        /// pronoun labels are appended to <c>compoundBuffer</c>
        /// </returns>
        private CoreLabel ProcessVerb(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

            SpanishVerbStripper.StrippedVerb parts = verbStripper.SeparatePronouns(cl.Word());
            if (parts == null)
            {
                return cl;
            }

            // The stem ends where the original (pre-normalization) stem text ends; each pronoun
            // then starts where the previous piece stopped.
            int stemEnd      = cl.BeginPosition() + parts.GetOriginalStem().Length;
            int pronounBegin = stemEnd;

            foreach (string pronoun in parts.GetPronouns())
            {
                compoundBuffer.Add(CopyCoreLabel(cl, pronoun, pronounBegin));
                pronounBegin += pronoun.Length;
            }

            CoreLabel stemLabel = CopyCoreLabel(cl, parts.GetStem(), cl.BeginPosition(), stemEnd);
            stemLabel.SetOriginalText(parts.GetOriginalStem());
            return stemLabel;
        }
Exemple #25
0
 /// <summary>
 /// Writes sentences and their dependency trees to <paramref name="outFile"/>, one token per line,
 /// with an empty line after each sentence.
 /// </summary>
 /// <remarks>
 /// Each token line holds ten tab-separated columns: the 1-based token index, the word, "_",
 /// the tag (twice), "_", the head index and the dependency label taken from the tree,
 /// and two trailing "_" columns.
 /// </remarks>
 /// <param name="outFile">path handed to <c>IOUtils.GetPrintWriter</c></param>
 /// <param name="sentences">sentences to write; tokens are read from their TokensAnnotation</param>
 /// <param name="trees">dependency trees, indexed in parallel with <paramref name="sentences"/></param>
 /// <exception cref="RuntimeIOException">wraps any exception raised while opening or writing the file</exception>
 public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
 {
     try
     {
         PrintWriter output = IOUtils.GetPrintWriter(outFile);
         try
         {
             for (int i = 0; i < sentences.Count; i++)
             {
                 ICoreMap          sentence = sentences[i];
                 DependencyTree    tree     = trees[i];
                 IList <CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 // The loop bound used to be an undeclared "size"; it is the token count.
                 int size = tokens.Count;
                 // Token positions in the output and in the tree are 1-based; the list is 0-based.
                 for (int j = 1; j <= size; ++j)
                 {
                     CoreLabel token = tokens[j - 1];
                     output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                 }
                 output.Println();
             }
         }
         finally
         {
             // Close the writer even when writing fails part-way, so the file handle is not leaked.
             output.Close();
         }
     }
     catch (Exception e)
     {
         throw new RuntimeIOException(e);
     }
 }
Exemple #26
0
        // && !text.contains("+") &&
        // !text.contains("*");// && !
        // text.contains("$") && !text.contains("\"");
        /// <summary>
        /// Builds a map from every token index in <paramref name="sent"/> to the surface patterns
        /// created around that token. Tokens rejected by <c>PatternFactory.DoNotUse</c> map to an empty set.
        /// </summary>
        /// <param name="sent">sentence whose tokens are processed</param>
        /// <param name="stopWords">stop words passed to the rejection check and to <c>GetContext</c></param>
        /// <returns>a dictionary with one entry per token index</returns>
        public static IDictionary <int, ISet> GetPatternsAroundTokens(DataInstance sent, ICollection <CandidatePhrase> stopWords)
        {
            IDictionary <int, ISet> patternsByIndex = new Dictionary <int, ISet>();
            IList <CoreLabel>       tokens          = sent.GetTokens();

            for (int position = 0; position < tokens.Count; position++)
            {
                // do not create patterns around stop words!
                bool rejected = PatternFactory.DoNotUse(tokens[position].Word(), stopWords);
                if (rejected)
                {
                    patternsByIndex[position] = new HashSet <SurfacePattern>();
                }
                else
                {
                    ICollection <SurfacePattern> context = GetContext(sent.GetTokens(), position, stopWords);
                    patternsByIndex[position] = context;
                }
            }
            return patternsByIndex;
        }
Exemple #27
0
 /// <summary>
 /// Creates a Debinarizer with the given <paramref name="forceCNF"/> setting, delegating to the
 /// two-argument constructor with <c>CoreLabel.Factory()</c> as the second argument.
 /// </summary>
 /// <param name="forceCNF">forwarded unchanged to the two-argument constructor</param>
 public Debinarizer(bool forceCNF)
     : this(forceCNF, CoreLabel.Factory())
 {
 }
 /// <summary>
 /// Tells whether the label carries a lemma that is neither null nor empty.
 /// </summary>
 /// <param name="l">label to inspect</param>
 /// <returns>true when <c>l.Lemma()</c> is non-null and non-empty</returns>
 private static bool LemmaExists(CoreLabel l)
 {
     if (l.Lemma() == null)
     {
         return false;
     }
     bool blank = l.Lemma().IsEmpty();
     return !blank;
 }
 /// <summary>
 /// Runs every pattern in <c>patterns</c> over the token list of every sentence id in <c>sentids</c>
 /// and collects the phrases captured by the "$term" group of each match.
 /// </summary>
 /// <remarks>
 /// For each match the span may first be widened over neighboring tokens that already carry
 /// <c>label</c> (when <c>constVars.clubNeighboringLabeledWords</c> is set). Every token in the span is
 /// annotated as matched and has the pattern added to its MatchedPatterns set. A phrase is discarded when
 /// a token matches an ignore class, when it contains a stop word and <c>removePhrasesWithStopWords</c>
 /// is set, or when a skipped token lies between two tokens that were kept.
 /// </remarks>
 /// <returns>
 /// A triple of: the phrase-by-pattern frequency counter, the matched token spans per pattern
 /// (sentence id, start, inclusive end), and the phrases whose tokens were all already labeled.
 /// </returns>
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 //Higher branch values makes the faster but uses more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     // Token span [s, e) of the "$term" capture group for this match.
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         // Walk left from the match; s moves to just after the first token that does not carry the label.
                         // NOTE(review): if every token before s carries the label, the loop ends without
                         // moving s (it is not reset to 0) - confirm this is intended.
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         // Walk right from the match; e moves to the first token that does not carry the label.
                         // NOTE(review): likewise, e is left unchanged when every remaining token carries the label.
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         // Annotate the token as matched and make sure it has a MatchedPatterns set to add to.
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         // A token whose annotation equals one of the configured ignore values rejects the whole phrase.
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             // Keep the token unless it is a stop word that should be stripped from the phrase.
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 // Any kept token that does not already carry the label marks the phrase as not fully labeled.
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     // Reject the phrase when a skipped token sits directly between two kept tokens.
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         // The span is recorded with an inclusive end index (e - 1).
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         // Log, then rethrow with the original stack trace preserved.
         logger.Error(e);
         throw;
     }
 }
Exemple #30
0
 /// <summary>
 /// Convenience overload that computes the end position as
 /// <paramref name="beginPosition"/> + <c>part.Length</c> and delegates to the four-argument overload.
 /// </summary>
 /// <param name="cl">label passed through to the four-argument overload</param>
 /// <param name="part">text of the new piece; its length determines the end position</param>
 /// <param name="beginPosition">begin position of the new piece</param>
 /// <returns>whatever the four-argument overload returns</returns>
 private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition)
 {
     return(CopyCoreLabel(cl, part, beginPosition, beginPosition + part.Length));
 }
Exemple #31
0
        /*/**
   * Simple tree reading utility method.  Given a tree formatted as a PTB string, returns a Tree made by a specific TreeFactory.
   #1#
  public static Tree readTree(string ptbTreeString, TreeFactory treeFactory) {
    try {
      PennTreeReader ptr = new PennTreeReader(new StringReader(ptbTreeString), treeFactory);
      return ptr.readTree();
    } catch (IOException ex) {
      throw new SystemException(ex);
    }
  }*/

        /**
   * Simple tree reading utility method.  Given a tree formatted as a PTB string, returns a Tree made by the default TreeFactory (LabeledScoredTreeFactory)
   */
        /*public static Tree readTree(string str) {
    return readTree(str, defaultTreeFactory);
  }*/

        /// <summary>
        /// Recursively replaces every label in the tree that is not already a <c>CoreLabel</c>
        /// with a new <c>CoreLabel</c> carrying the same value.
        /// Needed because extra information, such as the token span, is stored in the CoreLabel.
        /// </summary>
        /// <param name="tree">root of the (sub)tree to convert in place</param>
        public static void ConvertToCoreLabels(Tree tree)
        {
            ILabel existing = tree.Label();
            bool   isCore   = existing is CoreLabel;

            if (!isCore)
            {
                var replacement = new CoreLabel();
                replacement.SetValue(existing.Value());
                tree.SetLabel(replacement);
            }

            // Convert the node itself first, then each child subtree in order.
            foreach (Tree child in tree.Children())
            {
                ConvertToCoreLabels(child);
            }
        }
Exemple #32
0
        // Attributes conversational mentions: a quote inherits the mention of the quote two positions earlier.
        // If quote X has not been labelled, has no additional text in its paragraph, quote X-2 has been labelled,
        // quotes X-2, X-1 and X sit in consecutive paragraphs, and X-2's mention type is not a pronoun,
        // then quote X is given the same mention as X-2.
        public override void DoQuoteToMention(Annotation doc)
        {
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));

            for (int index = 2; index < quotes.Count; index++)
            {
                ICoreMap currQuote    = quotes[index];
                ICoreMap prevQuote    = quotes[index - 1];
                ICoreMap twoPrevQuote = quotes[index - 2];
                int      twoPrevPara  = GetQuoteParagraph(twoPrevQuote);

                // Default to the first quote of the paragraph in which quote n-2 begins.
                int earlier = index - 3;
                while (earlier >= 0 && GetQuoteParagraph(quotes[earlier]) == twoPrevPara)
                {
                    twoPrevQuote = quotes[earlier];
                    earlier--;
                }

                int      tokenBeginIdx          = currQuote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int      tokenEndIdx            = currQuote.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                ICoreMap currQuoteBeginSentence = sentences[currQuote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                bool     sharesParagraph        = false;

                // Does the token just before the quote belong to the same paragraph?
                if (tokenBeginIdx > 0)
                {
                    CoreLabel tokenBefore    = tokens[tokenBeginIdx - 1];
                    ICoreMap  sentenceBefore = sentences[tokenBefore.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
                    if (sentenceBefore.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(currQuoteBeginSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
                    {
                        sharesParagraph = true;
                    }
                }

                // Does the first non-newline token after the quote belong to the same paragraph?
                if (tokenEndIdx < tokens.Count - 1)
                {
                    // if the next token is *NL*, it won't be in a sentence (if newlines have been tokenized)
                    // so advance to the next non *NL* token
                    CoreLabel tokenAfter = tokens[tokenEndIdx + 1];
                    while (tokenAfter.IsNewline() && tokenEndIdx + 1 < tokens.Count - 1)
                    {
                        tokenEndIdx++;
                        tokenAfter = tokens[tokenEndIdx + 1];
                    }
                    if (!tokenAfter.IsNewline())
                    {
                        ICoreMap sentenceAfter = sentences[tokenAfter.Get(typeof(CoreAnnotations.SentenceIndexAnnotation))];
                        if (sentenceAfter.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)).Equals(currQuoteBeginSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation))))
                        {
                            sharesParagraph = true;
                        }
                    }
                }

                // Skip conditions, checked in the same order as a single short-circuit expression would.
                if (twoPrevQuote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) == null)
                {
                    continue;
                }
                if (sharesParagraph)
                {
                    continue;
                }
                if (currQuote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }
                if (twoPrevQuote.Get(typeof(QuoteAttributionAnnotator.MentionTypeAnnotation)).Equals(Sieve.Pronoun))
                {
                    continue;
                }

                // The three quotes must lie in consecutive paragraphs.
                int currPara = GetQuoteParagraph(currQuote);
                int prevPara = GetQuoteParagraph(prevQuote);
                if (currPara == prevPara + 1 && prevPara == GetQuoteParagraph(twoPrevQuote) + 1)
                {
                    FillInMention(currQuote, GetMentionData(twoPrevQuote), sieveName);
                }
            }
        }
Exemple #33
0
 /// <summary>
 /// Builds a <code>TreeGraphNode</code> mirroring the structure and label values of an existing
 /// tree without sharing storage with it. The whole subtree is copied recursively.
 /// </summary>
 /// <param name="t">the tree to copy</param>
 /// <param name="parent">the parent node</param>
 protected TreeGraphNode(Tree t, TreeGraphNode parent)
 {
     _parent = parent;

     Tree[] sourceKids = t.Children();
     _children = new TreeGraphNode[sourceKids.Length];

     int slot = 0;
     foreach (Tree sourceKid in sourceKids)
     {
         TreeGraphNode copy = new TreeGraphNode(sourceKid, this);
         _children[slot] = copy;
         if (t.IsPreTerminal())
         {
             // add the tags to the leaves
             copy._label.SetTag(t.Label().Value());
         }
         slot++;
     }

     // The node's own label is created last, after all children have been built.
     _label = (CoreLabel) Mlf.NewLabel(t.Label());
 }
Exemple #34
0
 /// <summary>
 /// Sets the label associated with the current node
 /// </summary>
 /// <param name="label">the label to store; the reference is assigned directly, not copied</param>
 public void SetLabel(CoreLabel label)
 {
     this._label = label;
 }
Exemple #35
0
 /// <summary>
 /// Returns this node's label rendered in the requested output format,
 /// by delegating to the label's own <c>ToString(format)</c>.
 /// </summary>
 /// <param name="format">output format passed through to the label</param>
 /// <returns>the string produced by the label for <paramref name="format"/></returns>
 public string ToString(CoreLabel.OutputFormat format)
 {
     return _label.ToString(format);
 }
Exemple #36
0
 /// <summary>
 /// Create a new <code>TreeGraphNode</code> with the supplied label
 /// </summary>
 /// <remarks>
 /// The stored label is the result of <c>Mlf.NewLabel(label)</c> cast to <c>CoreLabel</c>,
 /// not the <paramref name="label"/> reference itself.
 /// </remarks>
 /// <param name="label">the label for this node</param>
 public TreeGraphNode(ILabel label)
 {
     this._label = (CoreLabel) Mlf.NewLabel(label);
 }