Ejemplo n.º 1
0
        /// <summary>
        /// Creates a head finder and fills <c>NonTerminalInfo</c> with the modified Collins
        /// head rules listed in the body.
        /// </summary>
        /// <remarks>
        /// Every entry maps a parent category label to an array of rules. Each rule is a string
        /// array whose first element is one of the direction constants used here
        /// (<c>Left</c>, <c>Right</c>, <c>RightDis</c>), followed by zero or more child labels.
        /// How the base class evaluates these rules is not visible from this constructor.
        /// The NP rule set is shared (same array instance) by NX and NML, and the ROOT rule set
        /// is shared by TOP; the VerbBaseForm entry gets its own copy of the VP rule.
        /// </remarks>
        /// <param name="tlp">
        /// Language pack forwarded to the base constructor together with the result of its
        /// <c>PunctuationTags()</c> call. It is dereferenced immediately, so it must not be null.
        /// </param>
        public ModCollinsHeadFinder(AbstractTreebankLanguagePack tlp) : base(tlp, tlp.PunctuationTags())
        {
            // The punctuation tags are handed to the base class to avoid punctuation
            // as head in the final default rule.

            NonTerminalInfo = new Dictionary<string, string[][]>();

            // This version from Collins' diss (1999: 236-238)
            // NNS, NN is actually sensible (money, etc.)!
            // QP early isn't; should prefer JJR NN RB
            // remove ADVP; it just shouldn't be there.
            // if two JJ, should take right one (e.g. South Korean)
            // NonTerminalInfo.Add(CoordinationTransformer.Adjective, new string[][]{{Left, PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.DollarSign, "QP"}, {Right, PartsOfSpeech.Adjective}, {Left, PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple, CoordinationTransformer.Adjective, "JJP", PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord, PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative, "SBAR", PartsOfSpeech.Adverb}});
            string[][] adjectivePhraseRules = new string[][]
            {
                new string[] { Left, PartsOfSpeech.DollarSign },
                new string[]
                {
                    RightDis,
                    PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective, QP,
                    PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple
                },
                new string[] { Left, CoordinationTransformer.Adjective },
                new string[]
                {
                    RightDis,
                    JJP, PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.Determiner, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
                    PartsOfSpeech.CardinalNumber, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    PartsOfSpeech.VerbPastTense
                },
                new string[] { Left, ADVP, CoordinationTransformer.Noun }
            };
            NonTerminalInfo.Add(CoordinationTransformer.Adjective, adjectivePhraseRules);

            // JJP is introduced for NML-like adjective phrases in Vadas' treebank; Chris wishes he hadn't used JJP which should be a POS-tag.
            string[][] jjpRules = new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.DollarSign, QP,
                    PartsOfSpeech.Adjective, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, CoordinationTransformer.Adjective, JJP,
                    PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Noun,
                    PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord,
                    PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative, SBAR, PartsOfSpeech.Adverb
                }
            };
            NonTerminalInfo.Add(JJP, jjpRules);

            // ADVP rule rewritten by Chris in Nov 2010 to be rightdis.  This is right! JJ.* is often head and rightmost.
            string[][] adverbPhraseRules = new string[][]
            {
                new string[] { Left, ADVP, PartsOfSpeech.PrepositionOrSubordinateConjunction },
                new string[]
                {
                    RightDis,
                    PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative,
                    PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative
                },
                new string[]
                {
                    RightDis,
                    PartsOfSpeech.Particle, PartsOfSpeech.Determiner, PartsOfSpeech.NounSingularOrMass,
                    PartsOfSpeech.CardinalNumber, CoordinationTransformer.Noun, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.ProperNounSingular, PartsOfSpeech.CoordinatingConjunction,
                    PartsOfSpeech.ForeignWord, PartsOfSpeech.NounPlural, CoordinationTransformer.Adjective, NML
                }
            };
            NonTerminalInfo.Add(ADVP, adverbPhraseRules);

            NonTerminalInfo.Add(CONJP, new string[][]
            {
                new string[]
                {
                    Right,
                    PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.Adverb,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction
                }
            });
            NonTerminalInfo.Add(FRAG, new string[][] { new string[] { Right } }); // crap
            NonTerminalInfo.Add(INTJ, new string[][] { new string[] { Left } });
            NonTerminalInfo.Add(LST, new string[][]
            {
                new string[] { Right, PartsOfSpeech.ListItemMarker, PartsOfSpeech.ColonSemiColon }
            });

            // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
            // TODO: NNP should be head (rare cases, could be ignored):
            //   (NAC (NML New York) (NNP Court) (PP of Appeals))
            //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
            // Chris: This could maybe still do with more thought, but NAC is rare.
            NonTerminalInfo.Add(NAC, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, NML,
                    PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural,
                    CoordinationTransformer.Noun, NAC,
                    PartsOfSpeech.ExistentialThere, PartsOfSpeech.DollarSign, PartsOfSpeech.CardinalNumber, QP,
                    PartsOfSpeech.PersonalPronoun, PartsOfSpeech.VerbGerundOrPresentParticiple,
                    PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Adjective, JJP,
                    PartsOfSpeech.ForeignWord
                }
            });

            // Added JJ to PP head table, since it is a head in several cases, e.g.:
            // (PP (JJ next) (PP to them))
            // When you have both JJ and IN daughters, it is invariably "such as" -- not so clear which should be head, but leave as IN
            // should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))  Michel thinks we should make JJ a head of PP
            // added SYM as used in new treebanks for symbols filling role of IN
            // Changed PP search to left -- just what you want for conjunction (and consistent with SemanticHeadFinder)
            NonTerminalInfo.Add(PP, new string[][]
            {
                new string[]
                {
                    Right,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.Particle, PartsOfSpeech.ForeignWord, PartsOfSpeech.Adjective,
                    PartsOfSpeech.Symbol
                },
                new string[] { Left, PP }
            });

            NonTerminalInfo.Add(PRN, new string[][]
            {
                new string[]
                {
                    Left,
                    VP, CoordinationTransformer.Noun, PP, SQ, S, SINV, SBAR,
                    CoordinationTransformer.Adjective, JJP, ADVP, INTJ, WHNP, NAC,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.Adjective,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular
                }
            });
            NonTerminalInfo.Add(PRT, new string[][] { new string[] { Right, PartsOfSpeech.Particle } });

            // add '#' for pounds!!
            NonTerminalInfo.Add(QP, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.DollarSign, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective,
                    PartsOfSpeech.CardinalNumber, PartsOfSpeech.Predeterminer, PartsOfSpeech.Determiner,
                    PartsOfSpeech.Adverb, NCD, QP, PartsOfSpeech.AdjectiveComparative,
                    PartsOfSpeech.AdjectiveSuperlative
                }
            });

            // reduced relative clause can be any predicate VP, ADJP, NP, PP.
            // For choosing between NP and PP, really need to know which one is temporal and to choose the other.
            // It's not clear ADVP needs to be in the list at all (delete?).
            NonTerminalInfo.Add(RRC, new string[][]
            {
                new string[] { Left, RRC },
                new string[]
                {
                    Right,
                    VP, CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun, PP, ADVP
                }
            });

            // delete IN -- go for main part of sentence; add FRAG
            NonTerminalInfo.Add(S, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.To, VP, S, FRAG, SBAR, CoordinationTransformer.Adjective, JJP, UCP,
                    CoordinationTransformer.Noun
                }
            });
            NonTerminalInfo.Add(SBAR, new string[][]
            {
                new string[]
                {
                    Left,
                    WHNP, WHPP, WHADVP, WHADJP, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    PartsOfSpeech.Determiner, S, SQ, SINV, SBAR, FRAG
                }
            });
            NonTerminalInfo.Add(SBARQ, new string[][]
            {
                new string[] { Left, SQ, S, SINV, SBARQ, FRAG, SBAR }
            });

            // cdm: if you have 2 VP under an SINV, you should really take the 2nd as syntactic head, because the first is a topicalized VP complement of the second, but for now I didn't change this, since it didn't help parsing.  (If it were changed, it'd need to be also changed to the opposite in SemanticHeadFinder.)
            NonTerminalInfo.Add(SINV, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm, PartsOfSpeech.Modal,
                    PartsOfSpeech.VerbPastParticiple, VP, S, SINV, CoordinationTransformer.Adjective, JJP,
                    CoordinationTransformer.Noun
                }
            });

            // TODO: Should maybe put S before SQ for tag questions. Check.
            NonTerminalInfo.Add(SQ, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm, PartsOfSpeech.Modal,
                    AUX, AUXG, VP, SQ
                }
            });
            NonTerminalInfo.Add(UCP, new string[][] { new string[] { Right } });

            // below is weird!! Make 2 lists, one for good and one for bad heads??
            // VP: added AUX and AUXG to work with Charniak tags
            // Kept in a local so the VerbBaseForm entry further down can be derived from it
            // instead of repeating the whole list by hand.
            string[] verbPhraseRule = new string[]
            {
                Left,
                PartsOfSpeech.To, PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple,
                PartsOfSpeech.Modal, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent, VP,
                AUX, AUXG, CoordinationTransformer.Adjective, JJP,
                PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, PartsOfSpeech.Adjective,
                CoordinationTransformer.Noun, PartsOfSpeech.ProperNounSingular
            };
            NonTerminalInfo.Add(VP, new string[][] { verbPhraseRule });

            NonTerminalInfo.Add(WHADJP, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.WhAdverb, WHADVP, PartsOfSpeech.Adverb, PartsOfSpeech.Adjective,
                    CoordinationTransformer.Adjective, JJP, PartsOfSpeech.AdjectiveComparative
                }
            });
            NonTerminalInfo.Add(WHADVP, new string[][]
            {
                new string[] { Right, PartsOfSpeech.WhAdverb, WHADVP }
            });
            NonTerminalInfo.Add(WHNP, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.WhDeterminer, PartsOfSpeech.WhPronoun, PartsOfSpeech.PossessiveWhPronoun,
                    WHADJP, WHPP, WHNP
                }
            });
            NonTerminalInfo.Add(WHPP, new string[][]
            {
                new string[]
                {
                    Right,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
                    PartsOfSpeech.ForeignWord
                }
            });
            NonTerminalInfo.Add(X, new string[][]
            {
                new string[]
                {
                    Right,
                    S, VP, CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun, SBAR, PP, X
                }
            });

            // The NP rule set is registered under three labels (NP, NX, NML); all three entries
            // refer to the same array instance.
            string[][] nounPhraseRules = new string[][]
            {
                new string[]
                {
                    RightDis,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular,
                    PartsOfSpeech.ProperNounPlural, PartsOfSpeech.NounPlural, NML, NX,
                    PartsOfSpeech.PossessiveEnding, PartsOfSpeech.AdjectiveComparative
                },
                new string[] { Left, CoordinationTransformer.Noun, PartsOfSpeech.PersonalPronoun },
                new string[]
                {
                    RightDis,
                    PartsOfSpeech.DollarSign, CoordinationTransformer.Adjective, JJP, PRN,
                    PartsOfSpeech.ForeignWord
                },
                new string[] { Right, PartsOfSpeech.CardinalNumber },
                new string[]
                {
                    RightDis,
                    PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.Adverb, QP,
                    PartsOfSpeech.Determiner, PartsOfSpeech.WhDeterminer, PartsOfSpeech.AdverbComparative, ADVP
                }
            };
            NonTerminalInfo.Add(CoordinationTransformer.Noun, nounPhraseRules);
            NonTerminalInfo.Add(NX, nounPhraseRules);
            // TODO: seems JJ should be head of NML in this case:
            // (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
            // (although JJ great is tagged wrong)
            NonTerminalInfo.Add(NML, nounPhraseRules);

            NonTerminalInfo.Add(POSSP, new string[][]
            {
                new string[] { Right, PartsOfSpeech.PossessiveEnding }
            });

            /* HJT: Adding the following to deal with oddly formed data in (for example) the Brown corpus */
            string[][] rootRules = new string[][]
            {
                new string[] { Left, S, SQ, SINV, SBAR, FRAG }
            };
            NonTerminalInfo.Add(ROOT, rootRules);
            // Just to handle trees which have TOP instead of ROOT at the root
            NonTerminalInfo.Add(TOP, rootRules);

            // for Brown (Roger)
            NonTerminalInfo.Add(TYPO, new string[][]
            {
                new string[]
                {
                    Left,
                    PartsOfSpeech.NounSingularOrMass, CoordinationTransformer.Noun, NML,
                    PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural, PartsOfSpeech.To,
                    PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.Modal,
                    PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent, VP,
                    CoordinationTransformer.Adjective, JJP, FRAG
                }
            });

            // The NML constant replaces a bare "NML" literal that was used only in this rule,
            // so the label is spelled the same way as everywhere else in the table.
            NonTerminalInfo.Add(ADV, new string[][]
            {
                new string[]
                {
                    Right,
                    PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative,
                    PartsOfSpeech.ForeignWord, ADVP, PartsOfSpeech.To, PartsOfSpeech.CardinalNumber,
                    PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.Adjective,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction, CoordinationTransformer.Noun, NML,
                    PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.NounSingularOrMass
                }
            });

            // SWBD
            // crap rule for Switchboard (if don't delete EDITED nodes)
            NonTerminalInfo.Add(EDITED, new string[][] { new string[] { Left } });

            // In sw2756 a VerbBaseForm label shows up where a rule is needed (presumably as a
            // parent category); give it a copy of the VP rule to handle this problem, though
            // it should really be fixed on reading. Clone() keeps this entry's array separate
            // from the VP entry's array.
            NonTerminalInfo.Add(PartsOfSpeech.VerbBaseForm, new string[][]
            {
                (string[]) verbPhraseRule.Clone()
            });

            // rule for OntoNotes, but maybe should just be deleted in TreeReader??
            NonTerminalInfo.Add(META, new string[][] { new string[] { Left } });

            // rule for new structure in QP, introduced by Stanford in QPTreeTransformer
            NonTerminalInfo.Add(XS, new string[][]
            {
                new string[] { Right, PartsOfSpeech.PrepositionOrSubordinateConjunction }
            });
            // NonTerminalInfo.Add(null, new string[][] {{Left}});  // rule for OntoNotes from Michel, but it would be better to fix this in TreeReader or to use a default rule?

            // todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.
            // defaultRule = defaultLeftRule; // Don't exception, take leftmost if no rule defined for a certain parent category
        }
Ejemplo n.º 2
0
        public ModCollinsHeadFinder(AbstractTreebankLanguagePack tlp) : base(tlp, tlp.PunctuationTags())
        {
            // avoid punctuation as head in readonly default rule

            NonTerminalInfo = new Dictionary <string, string[][]>();

            // This version from Collins' diss (1999: 236-238)
            // NNS, NN is actually sensible (money, etc.)!
            // QP early isn't; should prefer JJR NN RB
            // remove ADVP; it just shouldn't be there.
            // if two JJ, should take right one (e.g. South Korean)
            // NonTerminalInfo.Add(CoordinationTransformer.Adjective, new string[][]{{Left, PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.DollarSign, "QP"}, {Right, PartsOfSpeech.Adjective}, {Left, PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple, CoordinationTransformer.Adjective, "JJP", PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord, PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative, "SBAR", PartsOfSpeech.Adverb}});
            NonTerminalInfo.Add(CoordinationTransformer.Adjective,
                                new string[][]
            {
                new string[] { Left, PartsOfSpeech.DollarSign },
                new string[]
                {
                    RightDis, PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective,
                    QP, PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple
                },
                new string[] { Left, CoordinationTransformer.Adjective },
                new string[]
                {
                    RightDis, JJP, PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.Determiner, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
                    PartsOfSpeech.CardinalNumber, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    PartsOfSpeech.VerbPastTense
                },
                new string[] { Left, ADVP, CoordinationTransformer.Noun }
            });
            NonTerminalInfo.Add(JJP,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.DollarSign,
                    QP, PartsOfSpeech.Adjective, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, CoordinationTransformer.Adjective, JJP, PartsOfSpeech.AdjectiveComparative,
                    CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord,
                    PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative, SBAR, PartsOfSpeech.Adverb
                }
            });
            // JJP is introduced for NML-like adjective phrases in Vadas' treebank; Chris wishes he hadn't used JJP which should be a POS-tag.
            // ADVP rule rewritten by Chris in Nov 2010 to be rightdis.  This is right! JJ.* is often head and rightmost.
            NonTerminalInfo.Add(ADVP, new string[][]
            {
                new string[] { Left, ADVP, PartsOfSpeech.PrepositionOrSubordinateConjunction },
                new string[]
                {
                    RightDis, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative,
                    PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative
                },
                new string[]
                {
                    RightDis, PartsOfSpeech.Particle, PartsOfSpeech.Determiner, PartsOfSpeech.NounSingularOrMass,
                    PartsOfSpeech.CardinalNumber, CoordinationTransformer.Noun, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.ProperNounSingular, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.ForeignWord,
                    PartsOfSpeech.NounPlural, CoordinationTransformer.Adjective, NML
                }
            });
            NonTerminalInfo.Add(CONJP,
                                new string[][]
            {
                new string[]
                {
                    Right, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.Adverb,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction
                }
            });
            NonTerminalInfo.Add(FRAG, new string[][] { new string[] { Right } }); // crap
            NonTerminalInfo.Add(INTJ, new string[][] { new string[] { Left } });
            NonTerminalInfo.Add(LST,
                                new string[][] { new string[] { Right, PartsOfSpeech.ListItemMarker, PartsOfSpeech.ColonSemiColon } });

            // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
            // TODO: NNP should be head (rare cases, could be ignored):
            //   (NAC (NML New York) (NNP Court) (PP of Appeals))
            //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
            // Chris: This could maybe still do with more thought, but NAC is rare.
            NonTerminalInfo.Add(NAC,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, NML,
                    PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural, CoordinationTransformer.Noun, NAC,
                    PartsOfSpeech.ExistentialThere, PartsOfSpeech.DollarSign, PartsOfSpeech.CardinalNumber, QP,
                    PartsOfSpeech.PersonalPronoun, PartsOfSpeech.VerbGerundOrPresentParticiple,
                    PartsOfSpeech.Adjective,
                    PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Adjective, JJP,
                    PartsOfSpeech.ForeignWord
                }
            });

            // Added JJ to PP head table, since it is a head in several cases, e.g.:
            // (PP (JJ next) (PP to them))
            // When you have both JJ and IN daughters, it is invariably "such as" -- not so clear which should be head, but leave as IN
            // should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))  Michel thinks we should make JJ a head of PP
            // added SYM as used in new treebanks for symbols filling role of IN
            // Changed PP search to left -- just what you want for conjunction (and consistent with SemanticHeadFinder)
            NonTerminalInfo.Add(PP,
                                new string[][]
            {
                new string[]
                {
                    Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.Particle, PartsOfSpeech.ForeignWord, PartsOfSpeech.Adjective, PartsOfSpeech.Symbol
                },
                new string[] { Left, PP }
            });

            NonTerminalInfo.Add(PRN,
                                new string[][]
            {
                new string[]
                {
                    Left, VP, CoordinationTransformer.Noun, PP, SQ, S, SINV, SBAR, CoordinationTransformer.Adjective, JJP, ADVP, INTJ, WHNP,
                    NAC,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.Adjective,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular
                }
            });
            NonTerminalInfo.Add(PRT, new string[][] { new string[] { Right, PartsOfSpeech.Particle } });
            // add '#' for pounds!!
            NonTerminalInfo.Add(QP,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.DollarSign, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective,
                    PartsOfSpeech.CardinalNumber, PartsOfSpeech.Predeterminer, PartsOfSpeech.Determiner,
                    PartsOfSpeech.Adverb, NCD, QP, PartsOfSpeech.AdjectiveComparative,
                    PartsOfSpeech.AdjectiveSuperlative
                }
            });
            // reduced relative clause can be any predicate VP, ADJP, NP, PP.
            // For choosing between NP and PP, really need to know which one is temporal and to choose the other.
            // It's not clear ADVP needs to be in the list at all (delete?).
            NonTerminalInfo.Add(RRC,
                                new string[][] { new string[] { Left, RRC }, new string[] { Right, VP, CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun, PP, ADVP } });

            // delete IN -- go for main part of sentence; add FRAG

            NonTerminalInfo.Add(S,
                                new string[][] { new string[] { Left, PartsOfSpeech.To, VP, S, FRAG, SBAR, CoordinationTransformer.Adjective, JJP, UCP, CoordinationTransformer.Noun } });
            NonTerminalInfo.Add(SBAR,
                                new string[][]
            {
                new string[]
                {
                    Left, WHNP, WHPP, WHADVP, WHADJP, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    PartsOfSpeech.Determiner, S, SQ, SINV, SBAR, FRAG
                }
            });
            NonTerminalInfo.Add(SBARQ,
                                new string[][] { new string[] { Left, SQ, S, SINV, SBARQ, FRAG, SBAR } });
            // cdm: if you have 2 VP under an SINV, you should really take the 2nd as syntactic head, because the first is a topicalized VP complement of the second, but for now I didn't change this, since it didn't help parsing.  (If it were changed, it'd need to be also changed to the opposite in SemanticHeadFinder.)
            NonTerminalInfo.Add(SINV,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm, PartsOfSpeech.Modal,
                    PartsOfSpeech.VerbPastParticiple, VP, S, SINV, CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun
                }
            });
            NonTerminalInfo.Add(SQ,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm, PartsOfSpeech.Modal, AUX,
                    AUXG, VP, SQ
                }
            });
            // TODO: Should maybe put S before SQ for tag questions. Check.
            NonTerminalInfo.Add(UCP, new string[][] { new string[] { Right } });
            // below is weird!! Make 2 lists, one for good and one for bad heads??
            // VP: added AUX and AUXG to work with Charniak tags
            NonTerminalInfo.Add(VP,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.To, PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.Modal, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent, VP,
                    AUX, AUXG, CoordinationTransformer.Adjective, JJP,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, PartsOfSpeech.Adjective, CoordinationTransformer.Noun,
                    PartsOfSpeech.ProperNounSingular
                }
            });
            NonTerminalInfo.Add(WHADJP,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.WhAdverb, WHADVP, PartsOfSpeech.Adverb, PartsOfSpeech.Adjective, CoordinationTransformer.Adjective,
                    JJP, PartsOfSpeech.AdjectiveComparative
                }
            });
            NonTerminalInfo.Add(WHADVP, new string[][] { new string[] { Right, PartsOfSpeech.WhAdverb, WHADVP } });
            NonTerminalInfo.Add(WHNP,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.WhDeterminer, PartsOfSpeech.WhPronoun, PartsOfSpeech.PossessiveWhPronoun,
                    WHADJP, WHPP, WHNP
                }
            });
            NonTerminalInfo.Add(WHPP,
                                new string[][]
            {
                new string[]
                {
                    Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
                    PartsOfSpeech.ForeignWord
                }
            });
            NonTerminalInfo.Add(X,
                                new string[][] { new string[] { Right, S, VP, CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun, SBAR, PP, X } });
            NonTerminalInfo.Add(CoordinationTransformer.Noun,
                                new string[][]
            {
                new string[]
                {
                    RightDis, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular,
                    PartsOfSpeech.ProperNounPlural, PartsOfSpeech.NounPlural, NML, NX,
                    PartsOfSpeech.PossessiveEnding, PartsOfSpeech.AdjectiveComparative
                },
                new string[] { Left, CoordinationTransformer.Noun, PartsOfSpeech.PersonalPronoun },
                new string[] { RightDis, PartsOfSpeech.DollarSign, CoordinationTransformer.Adjective, JJP, PRN, PartsOfSpeech.ForeignWord },
                new string[] { Right, PartsOfSpeech.CardinalNumber },
                new string[]
                {
                    RightDis, PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.Adverb, QP,
                    PartsOfSpeech.Determiner, PartsOfSpeech.WhDeterminer, PartsOfSpeech.AdverbComparative, ADVP
                }
            });
            NonTerminalInfo.Add(NX, NonTerminalInfo[CoordinationTransformer.Noun]);
            // TODO: seems JJ should be head of NML in this case:
            // (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
            // (although JJ great is tagged wrong)
            NonTerminalInfo.Add(NML, NonTerminalInfo[CoordinationTransformer.Noun]);


            NonTerminalInfo.Add(POSSP, new string[][] { new string[] { Right, PartsOfSpeech.PossessiveEnding } });

            /* HJT: Adding the following to deal with oddly formed data in (for example) the Brown corpus */
            NonTerminalInfo.Add(ROOT, new string[][] { new string[] { Left, S, SQ, SINV, SBAR, FRAG } });
            // Just to handle trees which have TOP instead of ROOT at the root
            NonTerminalInfo.Add(TOP, NonTerminalInfo[ROOT]);
            NonTerminalInfo.Add(TYPO, new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.NounSingularOrMass, CoordinationTransformer.Noun, NML, PartsOfSpeech.ProperNounSingular,
                    PartsOfSpeech.ProperNounPlural, PartsOfSpeech.To,
                    PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.Modal,
                    PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent, VP, CoordinationTransformer.Adjective,
                    JJP, FRAG
                }
            }); // for Brown (Roger)
            NonTerminalInfo.Add(ADV, new string[][]
            {
                new string[]
                {
                    Right, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative,
                    PartsOfSpeech.ForeignWord,
                    ADVP, PartsOfSpeech.To, PartsOfSpeech.CardinalNumber, PartsOfSpeech.AdjectiveComparative,
                    PartsOfSpeech.Adjective, PartsOfSpeech.PrepositionOrSubordinateConjunction, CoordinationTransformer.Noun, "NML",
                    PartsOfSpeech.AdjectiveSuperlative, PartsOfSpeech.NounSingularOrMass
                }
            });

            // SWBD
            NonTerminalInfo.Add(EDITED, new string[][] { new string[] { Left } });
            // crap rule for Switchboard (if don't delete EDITED nodes)
            // in sw2756, a PartsOfSpeech.VerbBaseForm. (copy "VP" to handle this problem, though should really fix it on reading)
            NonTerminalInfo.Add(PartsOfSpeech.VerbBaseForm,
                                new string[][]
            {
                new string[]
                {
                    Left, PartsOfSpeech.To, PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.Modal, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent, VP,
                    AUX, AUXG, CoordinationTransformer.Adjective, JJP,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, PartsOfSpeech.Adjective, CoordinationTransformer.Noun,
                    PartsOfSpeech.ProperNounSingular
                }
            });

            NonTerminalInfo.Add(META, new string[][] { new string[] { Left } });
            // rule for OntoNotes, but maybe should just be deleted in TreeReader??
            NonTerminalInfo.Add(XS, new string[][] { new string[] { Right, PartsOfSpeech.PrepositionOrSubordinateConjunction } });
            // rule for new structure in QP, introduced by Stanford in QPTreeTransformer
            // NonTerminalInfo.Add(null, new string[][] {{Left}});  // rule for OntoNotes from Michel, but it would be better to fix this in TreeReader or to use a default rule?

            // todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.
            // defaultRule = defaultLeftRule; // Don't exception, take leftmost if no rule defined for a certain parent category
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates the head finder and fills <c>nonTerminalInfo</c> with one entry per
        /// parent category label.
        /// </summary>
        /// <remarks>
        /// Each entry is an array of rules. The first element of every rule is one of the
        /// keywords "left", "right" or "rightdis"; the remaining elements are child labels.
        /// How the keywords are interpreted is decided by the base class, which is not
        /// visible from this constructor.
        /// </remarks>
        /// <param name="tlp">
        /// Language pack handed to the base constructor together with its punctuation tags.
        /// </param>
        public ModCollinsHeadFinder(AbstractTreebankLanguagePack tlp) : base(tlp, tlp.PunctuationTags())
        {
            // The punctuation tags go to the base class so that punctuation is avoided
            // as head in the final default rule.

            // Keywords that open every rule below.
            const string dirLeft = "left";
            const string dirRight = "right";
            const string dirRightDis = "rightdis";

            nonTerminalInfo = new Dictionary<string, string[][]>();

            // Starting point: Collins' dissertation (1999: 236-238), with these changes to ADJP:
            //   - NNS, NN is actually sensible (money, etc.)!
            //   - QP early isn't; should prefer JJR NN RB
            //   - remove ADVP; it just shouldn't be there.
            //   - if two JJ, should take right one (e.g. South Korean)
            // Former ADJP rule, kept for reference:
            //   {"left", "NNS", "NN", "$", "QP"}, {"right", "JJ"},
            //   {"left", "VBN", "VBG", "ADJP", "JJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"}
            nonTerminalInfo.Add("ADJP", new[]
            {
                new[] { dirLeft, "$" },
                new[] { dirRightDis, "NNS", "NN", "JJ", "QP", "VBN", "VBG" },
                new[] { dirLeft, "ADJP" },
                new[] { dirRightDis, "JJP", "JJR", "JJS", "DT", "RB", "RBR", "CD", "IN", "VBD" },
                new[] { dirLeft, "ADVP", "NP" }
            });

            // JJP is introduced for NML-like adjective phrases in Vadas' treebank;
            // Chris wishes he hadn't used JJP which should be a POS-tag.
            nonTerminalInfo.Add("JJP", new[]
            {
                new[]
                {
                    dirLeft,
                    "NNS", "NN", "$", "QP", "JJ", "VBN", "VBG", "ADJP", "JJP",
                    "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"
                }
            });

            // ADVP rule rewritten by Chris in Nov 2010 to be rightdis.
            // This is right! JJ.* is often head and rightmost.
            nonTerminalInfo.Add("ADVP", new[]
            {
                new[] { dirLeft, "ADVP", "IN" },
                new[] { dirRightDis, "RB", "RBR", "RBS", "JJ", "JJR", "JJS" },
                new[]
                {
                    dirRightDis,
                    "RP", "DT", "NN", "CD", "NP", "VBN", "NNP", "CC", "FW", "NNS", "ADJP", "NML"
                }
            });

            nonTerminalInfo.Add("CONJP", new[] { new[] { dirRight, "CC", "RB", "IN" } });
            nonTerminalInfo.Add("FRAG", new[] { new[] { dirRight } }); // crap
            nonTerminalInfo.Add("INTJ", new[] { new[] { dirLeft } });
            nonTerminalInfo.Add("LST", new[] { new[] { dirRight, "LS", ":" } });

            // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
            // TODO: NNP should be head (rare cases, could be ignored):
            //   (NAC (NML New York) (NNP Court) (PP of Appeals))
            //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
            // Chris: This could maybe still do with more thought, but NAC is rare.
            nonTerminalInfo.Add("NAC", new[]
            {
                new[]
                {
                    dirLeft,
                    "NN", "NNS", "NML", "NNP", "NNPS", "NP", "NAC", "EX", "$", "CD",
                    "QP", "PRP", "VBG", "JJ", "JJS", "JJR", "ADJP", "JJP", "FW"
                }
            });

            // PP notes:
            //   - JJ was added to the PP head table, since it is a head in several cases,
            //     e.g. (PP (JJ next) (PP to them)).
            //   - When there are both JJ and IN daughters it is invariably "such as" -- not so
            //     clear which should be head, but it is left as IN. Should JJ be preferred?
            //     (PP (JJ such) (IN as) (NP (NN crocidolite)))  Michel thinks JJ should be a head of PP.
            //   - SYM was added, as it is used in new treebanks for symbols filling the role of IN.
            //   - The original note says the PP search was changed to left -- just what you want for
            //     conjunction (and consistent with SemanticHeadFinder).
            nonTerminalInfo.Add("PP", new[]
            {
                new[] { dirRight, "IN", "TO", "VBG", "VBN", "RP", "FW", "JJ", "SYM" },
                new[] { dirLeft, "PP" }
            });

            nonTerminalInfo.Add("PRN", new[]
            {
                new[]
                {
                    dirLeft,
                    "VP", "NP", "PP", "SQ", "S", "SINV", "SBAR", "ADJP", "JJP",
                    "ADVP", "INTJ", "WHNP", "NAC", "VBP", "JJ", "NN", "NNP"
                }
            });
            nonTerminalInfo.Add("PRT", new[] { new[] { dirRight, "RP" } });

            // add '#' for pounds!!
            nonTerminalInfo.Add("QP", new[]
            {
                new[]
                {
                    dirLeft,
                    "$", "IN", "NNS", "NN", "JJ", "CD", "PDT", "DT", "RB", "NCD", "QP", "JJR", "JJS"
                }
            });

            // A reduced relative clause can be any predicate VP, ADJP, NP, PP.
            // For choosing between NP and PP, really need to know which one is temporal
            // and to choose the other.
            // It's not clear ADVP needs to be in the list at all (delete?).
            nonTerminalInfo.Add("RRC", new[]
            {
                new[] { dirLeft, "RRC" },
                new[] { dirRight, "VP", "ADJP", "JJP", "NP", "PP", "ADVP" }
            });

            // S: delete IN -- go for main part of sentence; add FRAG
            nonTerminalInfo.Add("S", new[]
            {
                new[] { dirLeft, "TO", "VP", "S", "FRAG", "SBAR", "ADJP", "JJP", "UCP", "NP" }
            });
            nonTerminalInfo.Add("SBAR", new[]
            {
                new[]
                {
                    dirLeft,
                    "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV", "SBAR", "FRAG"
                }
            });
            nonTerminalInfo.Add("SBARQ", new[]
            {
                new[] { dirLeft, "SQ", "S", "SINV", "SBARQ", "FRAG", "SBAR" }
            });

            // cdm: if you have 2 VP under an SINV, you should really take the 2nd as syntactic head,
            // because the first is a topicalized VP complement of the second, but for now I didn't
            // change this, since it didn't help parsing.  (If it were changed, it'd need to be also
            // changed to the opposite in SemanticHeadFinder.)
            nonTerminalInfo.Add("SINV", new[]
            {
                new[]
                {
                    dirLeft,
                    "VBZ", "VBD", "VBP", "VB", "MD", "VBN", "VP", "S", "SINV", "ADJP", "JJP", "NP"
                }
            });

            // TODO: Should maybe put S before SQ for tag questions. Check.
            nonTerminalInfo.Add("SQ", new[]
            {
                new[] { dirLeft, "VBZ", "VBD", "VBP", "VB", "MD", "AUX", "AUXG", "VP", "SQ" }
            });
            nonTerminalInfo.Add("UCP", new[] { new[] { dirRight } });

            // below is weird!! Make 2 lists, one for good and one for bad heads??
            // VP: added AUX and AUXG to work with Charniak tags
            nonTerminalInfo.Add("VP", new[]
            {
                new[]
                {
                    dirLeft,
                    "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX",
                    "AUXG", "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP"
                }
            });

            nonTerminalInfo.Add("WHADJP", new[]
            {
                new[] { dirLeft, "WRB", "WHADVP", "RB", "JJ", "ADJP", "JJP", "JJR" }
            });
            nonTerminalInfo.Add("WHADVP", new[] { new[] { dirRight, "WRB", "WHADVP" } });
            nonTerminalInfo.Add("WHNP", new[]
            {
                new[] { dirLeft, "WDT", "WP", "WP$", "WHADJP", "WHPP", "WHNP" }
            });
            nonTerminalInfo.Add("WHPP", new[] { new[] { dirRight, "IN", "TO", "FW" } });
            nonTerminalInfo.Add("X", new[]
            {
                new[] { dirRight, "S", "VP", "ADJP", "JJP", "NP", "SBAR", "PP", "X" }
            });

            // One array instance is registered under "NP", "NX" and "NML".
            string[][] nounPhraseRules =
            {
                new[] { dirRightDis, "NN", "NNP", "NNPS", "NNS", "NML", "NX", "POS", "JJR" },
                new[] { dirLeft, "NP", "PRP" },
                new[] { dirRightDis, "$", "ADJP", "JJP", "PRN", "FW" },
                new[] { dirRight, "CD" },
                new[] { dirRightDis, "JJ", "JJS", "RB", "QP", "DT", "WDT", "RBR", "ADVP" }
            };
            nonTerminalInfo.Add("NP", nounPhraseRules);
            nonTerminalInfo.Add("NX", nounPhraseRules);
            // TODO: seems JJ should be head of NML in this case:
            // (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
            // (although JJ great is tagged wrong)
            nonTerminalInfo.Add("NML", nounPhraseRules);

            nonTerminalInfo.Add("POSSP", new[] { new[] { dirRight, "POS" } });

            /* HJT: Adding the following to deal with oddly formed data in (for example) the Brown corpus */
            string[][] rootRules =
            {
                new[] { dirLeft, "S", "SQ", "SINV", "SBAR", "FRAG" }
            };
            nonTerminalInfo.Add("ROOT", rootRules);
            // Just to handle trees which have TOP instead of ROOT at the root
            nonTerminalInfo.Add("TOP", rootRules);

            // for Brown (Roger)
            nonTerminalInfo.Add("TYPO", new[]
            {
                new[]
                {
                    dirLeft,
                    "NN", "NP", "NML", "NNP", "NNPS", "TO", "VBD", "VBN", "MD",
                    "VBZ", "VB", "VBG", "VBP", "VP", "ADJP", "JJP", "FRAG"
                }
            });
            nonTerminalInfo.Add("ADV", new[]
            {
                new[]
                {
                    dirRight,
                    "RB", "RBR", "RBS", "FW", "ADVP", "TO", "CD",
                    "JJR", "JJ", "IN", "NP", "NML", "JJS", "NN"
                }
            });

            // SWBD
            // crap rule for Switchboard (if don't delete EDITED nodes)
            nonTerminalInfo.Add("EDITED", new[] { new[] { dirLeft } });

            // Per the original note, sw2756 contains a problematic "VB"; the "VP" list is
            // copied here (as a separate array) to handle it, though it should really be
            // fixed on reading.
            nonTerminalInfo.Add("VB", new[]
            {
                new[]
                {
                    dirLeft,
                    "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX",
                    "AUXG", "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP"
                }
            });

            // rule for OntoNotes, but maybe should just be deleted in TreeReader??
            nonTerminalInfo.Add("META", new[] { new[] { dirLeft } });

            // rule for new structure in QP, introduced by Stanford in QPTreeTransformer
            nonTerminalInfo.Add("XS", new[] { new[] { dirRight, "IN" } });

            // Disabled rule for OntoNotes from Michel (it would be better to fix this in
            // TreeReader or to use a default rule?):
            // nonTerminalInfo.Add(null, new string[][] {{"left"}});

            // todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.
            // defaultRule = defaultLeftRule; // Don't throw an exception, take leftmost if no rule defined for a certain parent category
        }