/// <summary>
/// Initializes the state shared by Collins-style head finders.
/// Stores the language pack and derives the two last-resort rules
/// (<c>DefaultLeftRule</c> and <c>DefaultRightRule</c>) from the categories to avoid.
/// Last-resort processing applies when none of the regular rules matched: it looks for
/// the leftmost or rightmost constituent that is not in the avoided set, falling back
/// to the plain leftmost or rightmost constituent if necessary.
/// </summary>
/// <param name="tlp">TreebankLanguagePack used to determine the basic category</param>
/// <param name="categoriesToAvoid">Constituent types to avoid as head; may be empty</param>
protected AbstractCollinsHeadFinder(AbstractTreebankLanguagePack tlp, string[] categoriesToAvoid)
{
    Tlp = tlp;

    // Slot 0 of each default rule holds the direction marker; the avoided
    // categories (if any) follow from slot 1 onwards.
    int avoidCount = categoriesToAvoid.Length;
    string[] leftRule = new string[avoidCount + 1];
    string[] rightRule = new string[avoidCount + 1];

    if (avoidCount == 0)
    {
        // Nothing to avoid: a plain directional rule is enough.
        leftRule[0] = Left;
        rightRule[0] = Right;
    }
    else
    {
        leftRule[0] = LeftExcept;
        rightRule[0] = RightExcept;
        Array.Copy(categoriesToAvoid, 0, leftRule, 1, avoidCount);
        Array.Copy(categoriesToAvoid, 0, rightRule, 1, avoidCount);
    }

    DefaultLeftRule = leftRule;
    DefaultRightRule = rightRule;
}
/// <summary>
/// Creates a SemanticHeadFinder.
/// </summary>
/// <param name="tlp">
/// The TreebankLanguagePack, used by the superclass to get the basic category of constituents
/// </param>
/// <param name="noCopulaHead">
/// If true, a copular verb (be, seem, appear, stay, remain, resemble, become)
/// is not treated as head when it has an AdjP or NP complement. If false,
/// a copula verb is still always treated as a head, but it will still
/// be treated as an auxiliary in periphrastic tenses with a VP complement.
/// </param>
public SemanticHeadFinder(AbstractTreebankLanguagePack tlp, bool noCopulaHead)
    : base(tlp)
{
    RuleChanges();

    // Auxiliaries and copula verbs are kept apart so that the NP can be picked as the
    // semantic head in sentences like "Bill is an honest man".
    // (Added "sha" for "shan't", May 2009.)
    verbalAuxiliaries = new Util.HashSet<string>(Auxiliaries);
    passiveAuxiliaries = new Util.HashSet<string>(BeGetVerbs);

    // Copula verbs having an NP complement; the set stays empty unless
    // copulas are to be skipped as heads.
    var copulaSet = new Util.HashSet<string>();
    if (noCopulaHead)
    {
        copulaSet.AddAll(CopulaVerbs);
    }
    copulars = copulaSet;

    // TODO: reverse the polarity of noCopulaHead
    makeCopulaHead = !noCopulaHead;

    verbalTags = new Util.HashSet<string>(VerbTags);
    unambiguousAuxiliaryTags = new Util.HashSet<string>(UnambiguousAuxTags);
}
/// <summary>
/// Creates a SemanticHeadFinder and populates its auxiliary, copula and verb-tag sets.
/// </summary>
/// <param name="tlp">
/// The TreebankLanguagePack, used by the superclass to get the basic category of constituents
/// </param>
/// <param name="noCopulaHead">
/// If true, a copular verb (be, seem, appear, stay, remain, resemble, become)
/// is not treated as head when it has an AdjP or NP complement. If false,
/// a copula verb is still always treated as a head, but it will still
/// be treated as an auxiliary in periphrastic tenses with a VP complement.
/// </param>
public SemanticHeadFinder(AbstractTreebankLanguagePack tlp, bool noCopulaHead)
    : base(tlp)
{
    RuleChanges();

    // TODO: reverse the polarity of noCopulaHead
    makeCopulaHead = !noCopulaHead;

    // Tag sets.
    verbalTags = new Util.HashSet<string>(VerbTags);
    unambiguousAuxiliaryTags = new Util.HashSet<string>(UnambiguousAuxTags);

    // Word sets. Auxiliaries are distinguished from copula verbs so that the NP becomes
    // the semantic head in sentences like "Bill is an honest man".
    // (Added "sha" for "shan't", May 2009.)
    verbalAuxiliaries = new Util.HashSet<string>(Auxiliaries);
    passiveAuxiliaries = new Util.HashSet<string>(BeGetVerbs);

    // Copula verbs having an NP complement: only filled in when copulas must not be heads.
    copulars = new Util.HashSet<string>();
    if (noCopulaHead)
    {
        copulars.AddAll(CopulaVerbs);
    }
}
/// <summary>
/// Creates a transformer whose <c>Tlp</c> is a newly constructed PennTreebankLanguagePack.
/// </summary>
public DependencyTreeTransformer()
{
    this.Tlp = new PennTreebankLanguagePack();
}
/// <summary>
/// Creates a CollinsHeadFinder and fills its head-rule table.
/// Each table entry maps a parent category to one or more rules; a rule is an array whose
/// first element is the search direction and whose remaining elements are the
/// candidate head categories in priority order.
/// </summary>
/// <param name="tlp">TreebankLanguagePack passed on to the base class</param>
/// <param name="categoriesToAvoid">Categories passed on to the base class to avoid as heads</param>
public CollinsHeadFinder(AbstractTreebankLanguagePack tlp, string[] categoriesToAvoid)
    : base(tlp, categoriesToAvoid)
{
    // This version is from Collins' dissertation (1999: 236-238).
    nonTerminalInfo = new Dictionary<string, string[][]>
    {
        { "ADJP", new[] { new[] { "left", "NNS", "QP", "NN", "$", "ADVP", "JJ", "VBN", "VBG", "ADJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB" } } },
        { "ADVP", new[] { new[] { "right", "RB", "RBR", "RBS", "FW", "ADVP", "TO", "CD", "JJR", "JJ", "IN", "NP", "JJS", "NN" } } },
        { "CONJP", new[] { new[] { "right", "CC", "RB", "IN" } } },
        // Placeholder rule.
        { "FRAG", new[] { new[] { "right" } } },
        { "INTJ", new[] { new[] { "left" } } },
        { "LST", new[] { new[] { "right", "LS", ":" } } },
        { "NAC", new[] { new[] { "left", "NN", "NNS", "NNP", "NNPS", "NP", "NAC", "EX", "$", "CD", "QP", "PRP", "VBG", "JJ", "JJS", "JJR", "ADJP", "FW" } } },
        // Placeholder rule.
        { "NX", new[] { new[] { "left" } } },
        // Should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))
        { "PP", new[] { new[] { "right", "IN", "TO", "VBG", "VBN", "RP", "FW" } } },
        { "PRN", new[] { new[] { "left" } } },
        { "PRT", new[] { new[] { "right", "RP" } } },
        { "QP", new[] { new[] { "left", "$", "IN", "NNS", "NN", "JJ", "RB", "DT", "CD", "NCD", "QP", "JJR", "JJS" } } },
        { "RRC", new[] { new[] { "right", "VP", "NP", "ADVP", "ADJP", "PP" } } },
        { "S", new[] { new[] { "left", "TO", "IN", "VP", "S", "SBAR", "ADJP", "UCP", "NP" } } },
        { "SBAR", new[] { new[] { "left", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV", "SBAR", "FRAG" } } },
        { "SBARQ", new[] { new[] { "left", "SQ", "S", "SINV", "SBARQ", "FRAG" } } },
        { "SINV", new[] { new[] { "left", "VBZ", "VBD", "VBP", "VB", "MD", "VP", "S", "SINV", "ADJP", "NP" } } },
        { "SQ", new[] { new[] { "left", "VBZ", "VBD", "VBP", "VB", "MD", "VP", "SQ" } } },
        { "UCP", new[] { new[] { "right" } } },
        { "VP", new[] { new[] { "left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "AUX", "AUXG", "VP", "ADJP", "NN", "NNS", "NP" } } },
        { "WHADJP", new[] { new[] { "left", "CC", "WRB", "JJ", "ADJP" } } },
        { "WHADVP", new[] { new[] { "right", "CC", "WRB" } } },
        { "WHNP", new[] { new[] { "left", "WDT", "WP", "WP$", "WHADJP", "WHPP", "WHNP" } } },
        { "WHPP", new[] { new[] { "right", "IN", "TO", "FW" } } },
        // Placeholder rule.
        { "X", new[] { new[] { "right" } } },
        // NP is the only category with several rules, tried in the order listed.
        {
            "NP", new[]
            {
                new[] { "rightdis", "NN", "NNP", "NNPS", "NNS", "NX", "POS", "JJR" },
                new[] { "left", "NP" },
                new[] { "rightdis", "$", "ADJP", "PRN" },
                new[] { "right", "CD" },
                new[] { "rightdis", "JJ", "JJS", "RB", "QP" }
            }
        },
        // Another placeholder rule, for Brown (Roger).
        { "TYPO", new[] { new[] { "left" } } },
        // Placeholder rule for Switchboard (if EDITED nodes are not deleted).
        { "EDITED", new[] { new[] { "left" } } },
        // Rule for new structure in QP.
        { "XS", new[] { new[] { "right", "IN" } } }
    };
}
/// <summary>
/// Creates a CollinsHeadFinder with the traditional behavior: no category
/// (punctuation included) is specially avoided as a head. Delegates to the
/// two-argument constructor with <c>EmptyStringArray</c> as the categories to avoid.
/// </summary>
/// <param name="tlp">TreebankLanguagePack used for the basic category function</param>
public CollinsHeadFinder(AbstractTreebankLanguagePack tlp)
    : this(tlp, EmptyStringArray)
{
}
/// <summary>
/// Creates the function and stores the language pack it was given.
/// </summary>
/// <param name="tlp">TreebankLanguagePack kept in the <c>tlp</c> field</param>
public BasicCategoryStringFunction(AbstractTreebankLanguagePack tlp)
{
    this.tlp = tlp;
}
/// <summary>
/// Creates a ModCollinsHeadFinder and fills <c>NonTerminalInfo</c> with the modified
/// Collins head rules. The punctuation tags of <paramref name="tlp"/> are handed to the
/// base class as categories to avoid, so punctuation is avoided as head in the default rule.
/// Each table entry maps a parent category to one or more rules, tried in order; a rule
/// is an array whose first element is the search direction and whose remaining elements
/// are the candidate head categories in priority order.
/// </summary>
/// <param name="tlp">TreebankLanguagePack passed to the base class; also supplies the punctuation tags</param>
public ModCollinsHeadFinder(AbstractTreebankLanguagePack tlp)
    : base(tlp, tlp.PunctuationTags())
{
    NonTerminalInfo = new Dictionary<string, string[][]>();

    // This version is based on Collins' dissertation (1999: 236-238), with changes:
    //  - NNS, NN is actually sensible (money, etc.)!
    //  - QP early isn't; should prefer JJR NN RB
    //  - remove ADVP; it just shouldn't be there.
    //  - if two JJ, should take right one (e.g. South Korean)
    // The earlier three-rule ADJP table
    //   {Left, NNS, NN, $, QP}, {Right, JJ},
    //   {Left, VBN, VBG, ADJP, JJP, JJR, NP, JJS, DT, FW, RBR, RBS, SBAR, RB}
    // was replaced by the rule below.
    NonTerminalInfo.Add(CoordinationTransformer.Adjective, new string[][]
    {
        new string[] { Left, PartsOfSpeech.DollarSign },
        new string[]
        {
            RightDis, PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.Adjective, QP, PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.VerbGerundOrPresentParticiple
        },
        new string[] { Left, CoordinationTransformer.Adjective },
        new string[]
        {
            RightDis, JJP, PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Determiner, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.CardinalNumber, PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.VerbPastTense
        },
        new string[] { Left, ADVP, CoordinationTransformer.Noun }
    });

    // JJP is introduced for NML-like adjective phrases in Vadas' treebank;
    // Chris wishes he hadn't used JJP which should be a POS-tag.
    NonTerminalInfo.Add(JJP, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.DollarSign, QP, PartsOfSpeech.Adjective,
            PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple,
            CoordinationTransformer.Adjective, JJP, PartsOfSpeech.AdjectiveComparative,
            CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord,
            PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative, SBAR,
            PartsOfSpeech.Adverb
        }
    });

    // ADVP rule rewritten by Chris in Nov 2010 to be rightdis. This is right!
    // JJ.* is often head and rightmost.
    NonTerminalInfo.Add(ADVP, new string[][]
    {
        new string[] { Left, ADVP, PartsOfSpeech.PrepositionOrSubordinateConjunction },
        new string[]
        {
            RightDis, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.AdverbSuperlative, PartsOfSpeech.Adjective,
            PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative
        },
        new string[]
        {
            RightDis, PartsOfSpeech.Particle, PartsOfSpeech.Determiner,
            PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.CardinalNumber,
            CoordinationTransformer.Noun, PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.ProperNounSingular, PartsOfSpeech.CoordinatingConjunction,
            PartsOfSpeech.ForeignWord, PartsOfSpeech.NounPlural,
            CoordinationTransformer.Adjective, NML
        }
    });

    NonTerminalInfo.Add(CONJP, new string[][]
    {
        new string[]
        {
            Right, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.Adverb,
            PartsOfSpeech.PrepositionOrSubordinateConjunction
        }
    });

    // Placeholder rule.
    NonTerminalInfo.Add(FRAG, new string[][] { new string[] { Right } });
    NonTerminalInfo.Add(INTJ, new string[][] { new string[] { Left } });
    NonTerminalInfo.Add(LST, new string[][]
    {
        new string[] { Right, PartsOfSpeech.ListItemMarker, PartsOfSpeech.ColonSemiColon }
    });

    // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
    // TODO: NNP should be head (rare cases, could be ignored):
    //   (NAC (NML New York) (NNP Court) (PP of Appeals))
    //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
    // Chris: This could maybe still do with more thought, but NAC is rare.
    NonTerminalInfo.Add(NAC, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, NML,
            PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural,
            CoordinationTransformer.Noun, NAC, PartsOfSpeech.ExistentialThere,
            PartsOfSpeech.DollarSign, PartsOfSpeech.CardinalNumber, QP,
            PartsOfSpeech.PersonalPronoun, PartsOfSpeech.VerbGerundOrPresentParticiple,
            PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Adjective, JJP,
            PartsOfSpeech.ForeignWord
        }
    });

    // Added JJ to PP head table, since it is a head in several cases, e.g.:
    //   (PP (JJ next) (PP to them))
    // When you have both JJ and IN daughters, it is invariably "such as" -- not so clear
    // which should be head, but leave as IN.
    // Should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))
    // Michel thinks we should make JJ a head of PP.
    // Added SYM as used in new treebanks for symbols filling role of IN.
    // NOTE(review): an older note here said "Changed PP search to left -- just what you
    // want for conjunction (and consistent with SemanticHeadFinder)", but the first rule
    // below searches from the right and only the PP fallback searches from the left --
    // confirm which is intended.
    NonTerminalInfo.Add(PP, new string[][]
    {
        new string[]
        {
            Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
            PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Particle, PartsOfSpeech.ForeignWord, PartsOfSpeech.Adjective,
            PartsOfSpeech.Symbol
        },
        new string[] { Left, PP }
    });

    NonTerminalInfo.Add(PRN, new string[][]
    {
        new string[]
        {
            Left, VP, CoordinationTransformer.Noun, PP, SQ, S, SINV, SBAR,
            CoordinationTransformer.Adjective, JJP, ADVP, INTJ, WHNP, NAC,
            PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.Adjective,
            PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular
        }
    });

    NonTerminalInfo.Add(PRT, new string[][] { new string[] { Right, PartsOfSpeech.Particle } });

    // TODO: add '#' for pounds!!
    NonTerminalInfo.Add(QP, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.DollarSign, PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective,
            PartsOfSpeech.CardinalNumber, PartsOfSpeech.Predeterminer, PartsOfSpeech.Determiner,
            PartsOfSpeech.Adverb, NCD, QP, PartsOfSpeech.AdjectiveComparative,
            PartsOfSpeech.AdjectiveSuperlative
        }
    });

    // Reduced relative clause can be any predicate VP, ADJP, NP, PP.
    // For choosing between NP and PP, really need to know which one is temporal and to
    // choose the other. It's not clear ADVP needs to be in the list at all (delete?).
    NonTerminalInfo.Add(RRC, new string[][]
    {
        new string[] { Left, RRC },
        new string[]
        {
            Right, VP, CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun,
            PP, ADVP
        }
    });

    // Delete IN -- go for main part of sentence; add FRAG.
    NonTerminalInfo.Add(S, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.To, VP, S, FRAG, SBAR, CoordinationTransformer.Adjective,
            JJP, UCP, CoordinationTransformer.Noun
        }
    });

    NonTerminalInfo.Add(SBAR, new string[][]
    {
        new string[]
        {
            Left, WHNP, WHPP, WHADVP, WHADJP, PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.Determiner, S, SQ, SINV, SBAR, FRAG
        }
    });

    NonTerminalInfo.Add(SBARQ, new string[][]
    {
        new string[] { Left, SQ, S, SINV, SBARQ, FRAG, SBAR }
    });

    // cdm: if you have 2 VP under an SINV, you should really take the 2nd as syntactic
    // head, because the first is a topicalized VP complement of the second, but for now
    // I didn't change this, since it didn't help parsing. (If it were changed, it'd need
    // to be also changed to the opposite in SemanticHeadFinder.)
    NonTerminalInfo.Add(SINV, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.Modal, PartsOfSpeech.VerbPastParticiple, VP, S, SINV,
            CoordinationTransformer.Adjective, JJP, CoordinationTransformer.Noun
        }
    });

    // TODO: Should maybe put S before SQ for tag questions. Check.
    NonTerminalInfo.Add(SQ, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.Modal, AUX, AUXG, VP, SQ
        }
    });

    NonTerminalInfo.Add(UCP, new string[][] { new string[] { Right } });

    // Below is weird!! Make 2 lists, one for good and one for bad heads??
    // VP: added AUX and AUXG to work with Charniak tags.
    NonTerminalInfo.Add(VP, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.To, PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Modal, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent,
            VP, AUX, AUXG, CoordinationTransformer.Adjective, JJP,
            PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, PartsOfSpeech.Adjective,
            CoordinationTransformer.Noun, PartsOfSpeech.ProperNounSingular
        }
    });

    NonTerminalInfo.Add(WHADJP, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.WhAdverb, WHADVP, PartsOfSpeech.Adverb, PartsOfSpeech.Adjective,
            CoordinationTransformer.Adjective, JJP, PartsOfSpeech.AdjectiveComparative
        }
    });

    NonTerminalInfo.Add(WHADVP, new string[][]
    {
        new string[] { Right, PartsOfSpeech.WhAdverb, WHADVP }
    });

    NonTerminalInfo.Add(WHNP, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.WhDeterminer, PartsOfSpeech.WhPronoun,
            PartsOfSpeech.PossessiveWhPronoun, WHADJP, WHPP, WHNP
        }
    });

    NonTerminalInfo.Add(WHPP, new string[][]
    {
        new string[]
        {
            Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
            PartsOfSpeech.ForeignWord
        }
    });

    NonTerminalInfo.Add(X, new string[][]
    {
        new string[]
        {
            Right, S, VP, CoordinationTransformer.Adjective, JJP,
            CoordinationTransformer.Noun, SBAR, PP, X
        }
    });

    NonTerminalInfo.Add(CoordinationTransformer.Noun, new string[][]
    {
        new string[]
        {
            RightDis, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular,
            PartsOfSpeech.ProperNounPlural, PartsOfSpeech.NounPlural, NML, NX,
            PartsOfSpeech.PossessiveEnding, PartsOfSpeech.AdjectiveComparative
        },
        new string[] { Left, CoordinationTransformer.Noun, PartsOfSpeech.PersonalPronoun },
        new string[]
        {
            RightDis, PartsOfSpeech.DollarSign, CoordinationTransformer.Adjective, JJP, PRN,
            PartsOfSpeech.ForeignWord
        },
        new string[] { Right, PartsOfSpeech.CardinalNumber },
        new string[]
        {
            RightDis, PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Adverb, QP, PartsOfSpeech.Determiner, PartsOfSpeech.WhDeterminer,
            PartsOfSpeech.AdverbComparative, ADVP
        }
    });

    // NX and NML reuse the very same rule array as the noun phrase entry.
    NonTerminalInfo.Add(NX, NonTerminalInfo[CoordinationTransformer.Noun]);
    // TODO: seems JJ should be head of NML in this case:
    //   (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
    // (although JJ great is tagged wrong)
    NonTerminalInfo.Add(NML, NonTerminalInfo[CoordinationTransformer.Noun]);

    NonTerminalInfo.Add(POSSP, new string[][]
    {
        new string[] { Right, PartsOfSpeech.PossessiveEnding }
    });

    // HJT: Adding the following to deal with oddly formed data in (for example) the Brown corpus.
    NonTerminalInfo.Add(ROOT, new string[][]
    {
        new string[] { Left, S, SQ, SINV, SBAR, FRAG }
    });
    // Just to handle trees which have TOP instead of ROOT at the root.
    NonTerminalInfo.Add(TOP, NonTerminalInfo[ROOT]);

    // For Brown (Roger).
    NonTerminalInfo.Add(TYPO, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.NounSingularOrMass, CoordinationTransformer.Noun, NML,
            PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural, PartsOfSpeech.To,
            PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.Modal,
            PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent,
            VP, CoordinationTransformer.Adjective, JJP, FRAG
        }
    });

    // SWBD. Uses the NML constant like every other rule in this table
    // (this entry previously spelled it as a string literal).
    NonTerminalInfo.Add(ADV, new string[][]
    {
        new string[]
        {
            Right, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.AdverbSuperlative, PartsOfSpeech.ForeignWord, ADVP, PartsOfSpeech.To,
            PartsOfSpeech.CardinalNumber, PartsOfSpeech.AdjectiveComparative,
            PartsOfSpeech.Adjective, PartsOfSpeech.PrepositionOrSubordinateConjunction,
            CoordinationTransformer.Noun, NML, PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.NounSingularOrMass
        }
    });

    // Placeholder rule for Switchboard (if EDITED nodes are not deleted).
    NonTerminalInfo.Add(EDITED, new string[][] { new string[] { Left } });

    // In sw2756, a VB appears as a parent category. (Copy of the VP rule to handle this
    // problem, though it should really be fixed on reading.)
    NonTerminalInfo.Add(PartsOfSpeech.VerbBaseForm, new string[][]
    {
        new string[]
        {
            Left, PartsOfSpeech.To, PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Modal, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent,
            VP, AUX, AUXG, CoordinationTransformer.Adjective, JJP,
            PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural, PartsOfSpeech.Adjective,
            CoordinationTransformer.Noun, PartsOfSpeech.ProperNounSingular
        }
    });

    // Rule for OntoNotes, but maybe should just be deleted in TreeReader??
    NonTerminalInfo.Add(META, new string[][] { new string[] { Left } });

    // Rule for new structure in QP, introduced by Stanford in QPTreeTransformer.
    NonTerminalInfo.Add(XS, new string[][]
    {
        new string[] { Right, PartsOfSpeech.PrepositionOrSubordinateConjunction }
    });

    // Not enabled: a {Left} rule for a null parent category (OntoNotes, from Michel);
    // it would be better to fix this in TreeReader or to use a default rule.
    // TODO: set the default rule to the default left rule here if we always want to take
    // the leftmost child (instead of throwing) when no head rule is defined for the
    // mother category.
}
/// <summary>
/// Creates a ModCollinsHeadFinder. The punctuation tags of <paramref name="tlp"/> are
/// passed to the base class as categories to avoid (so punctuation is avoided as head in
/// the default rule), and <c>NonTerminalInfo</c> is set to the modified Collins head-rule
/// table built here. A table entry maps a parent category to its rules, tried in order;
/// each rule starts with a search direction followed by candidate head categories in
/// priority order.
/// </summary>
/// <param name="tlp">TreebankLanguagePack passed to the base class; also supplies the punctuation tags</param>
public ModCollinsHeadFinder(AbstractTreebankLanguagePack tlp)
    : base(tlp, tlp.PunctuationTags())
{
    // The table is assembled in a local and published once it is complete.
    var rules = new Dictionary<string, string[][]>();

    // This version is based on Collins' dissertation (1999: 236-238), with changes:
    //  - NNS, NN is actually sensible (money, etc.)!
    //  - QP early isn't; should prefer JJR NN RB
    //  - remove ADVP; it just shouldn't be there.
    //  - if two JJ, should take right one (e.g. South Korean)
    // The earlier three-rule ADJP table
    //   {Left, NNS, NN, $, QP}, {Right, JJ},
    //   {Left, VBN, VBG, ADJP, JJP, JJR, NP, JJS, DT, FW, RBR, RBS, SBAR, RB}
    // was replaced by the rule below.
    rules.Add(CoordinationTransformer.Adjective, new string[][]
    {
        new string[] { Left, PartsOfSpeech.DollarSign },
        new string[]
        {
            RightDis,
            PartsOfSpeech.NounPlural,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.Adjective,
            QP,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.VerbGerundOrPresentParticiple
        },
        new string[] { Left, CoordinationTransformer.Adjective },
        new string[]
        {
            RightDis,
            JJP,
            PartsOfSpeech.AdjectiveComparative,
            PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Determiner,
            PartsOfSpeech.Adverb,
            PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.CardinalNumber,
            PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.VerbPastTense
        },
        new string[] { Left, ADVP, CoordinationTransformer.Noun }
    });

    // JJP is introduced for NML-like adjective phrases in Vadas' treebank;
    // Chris wishes he hadn't used JJP which should be a POS-tag.
    rules.Add(JJP, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.NounPlural,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.DollarSign,
            QP,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.VerbGerundOrPresentParticiple,
            CoordinationTransformer.Adjective,
            JJP,
            PartsOfSpeech.AdjectiveComparative,
            CoordinationTransformer.Noun,
            PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Determiner,
            PartsOfSpeech.ForeignWord,
            PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.AdverbSuperlative,
            SBAR,
            PartsOfSpeech.Adverb
        }
    });

    // ADVP rule rewritten by Chris in Nov 2010 to be rightdis. This is right!
    // JJ.* is often head and rightmost.
    rules.Add(ADVP, new string[][]
    {
        new string[] { Left, ADVP, PartsOfSpeech.PrepositionOrSubordinateConjunction },
        new string[]
        {
            RightDis,
            PartsOfSpeech.Adverb,
            PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.AdverbSuperlative,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.AdjectiveComparative,
            PartsOfSpeech.AdjectiveSuperlative
        },
        new string[]
        {
            RightDis,
            PartsOfSpeech.Particle,
            PartsOfSpeech.Determiner,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.CardinalNumber,
            CoordinationTransformer.Noun,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.ProperNounSingular,
            PartsOfSpeech.CoordinatingConjunction,
            PartsOfSpeech.ForeignWord,
            PartsOfSpeech.NounPlural,
            CoordinationTransformer.Adjective,
            NML
        }
    });

    rules.Add(CONJP, new string[][]
    {
        new string[]
        {
            Right,
            PartsOfSpeech.CoordinatingConjunction,
            PartsOfSpeech.Adverb,
            PartsOfSpeech.PrepositionOrSubordinateConjunction
        }
    });

    // Placeholder rule.
    rules.Add(FRAG, new string[][] { new string[] { Right } });
    rules.Add(INTJ, new string[][] { new string[] { Left } });
    rules.Add(LST, new string[][]
    {
        new string[] { Right, PartsOfSpeech.ListItemMarker, PartsOfSpeech.ColonSemiColon }
    });

    // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
    // TODO: NNP should be head (rare cases, could be ignored):
    //   (NAC (NML New York) (NNP Court) (PP of Appeals))
    //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
    // Chris: This could maybe still do with more thought, but NAC is rare.
    rules.Add(NAC, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.NounPlural,
            NML,
            PartsOfSpeech.ProperNounSingular,
            PartsOfSpeech.ProperNounPlural,
            CoordinationTransformer.Noun,
            NAC,
            PartsOfSpeech.ExistentialThere,
            PartsOfSpeech.DollarSign,
            PartsOfSpeech.CardinalNumber,
            QP,
            PartsOfSpeech.PersonalPronoun,
            PartsOfSpeech.VerbGerundOrPresentParticiple,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.AdjectiveComparative,
            CoordinationTransformer.Adjective,
            JJP,
            PartsOfSpeech.ForeignWord
        }
    });

    // Added JJ to PP head table, since it is a head in several cases, e.g.:
    //   (PP (JJ next) (PP to them))
    // When you have both JJ and IN daughters, it is invariably "such as" -- not so clear
    // which should be head, but leave as IN.
    // Should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))
    // Michel thinks we should make JJ a head of PP.
    // Added SYM as used in new treebanks for symbols filling role of IN.
    // NOTE(review): an older note here said "Changed PP search to left -- just what you
    // want for conjunction (and consistent with SemanticHeadFinder)", but the first rule
    // below searches from the right and only the PP fallback searches from the left --
    // confirm which is intended.
    rules.Add(PP, new string[][]
    {
        new string[]
        {
            Right,
            PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.To,
            PartsOfSpeech.VerbGerundOrPresentParticiple,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Particle,
            PartsOfSpeech.ForeignWord,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.Symbol
        },
        new string[] { Left, PP }
    });

    rules.Add(PRN, new string[][]
    {
        new string[]
        {
            Left,
            VP,
            CoordinationTransformer.Noun,
            PP,
            SQ,
            S,
            SINV,
            SBAR,
            CoordinationTransformer.Adjective,
            JJP,
            ADVP,
            INTJ,
            WHNP,
            NAC,
            PartsOfSpeech.VerbNon3rdPersSingPresent,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.ProperNounSingular
        }
    });

    rules.Add(PRT, new string[][] { new string[] { Right, PartsOfSpeech.Particle } });

    // TODO: add '#' for pounds!!
    rules.Add(QP, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.DollarSign,
            PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.NounPlural,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.CardinalNumber,
            PartsOfSpeech.Predeterminer,
            PartsOfSpeech.Determiner,
            PartsOfSpeech.Adverb,
            NCD,
            QP,
            PartsOfSpeech.AdjectiveComparative,
            PartsOfSpeech.AdjectiveSuperlative
        }
    });

    // Reduced relative clause can be any predicate VP, ADJP, NP, PP.
    // For choosing between NP and PP, really need to know which one is temporal and to
    // choose the other. It's not clear ADVP needs to be in the list at all (delete?).
    rules.Add(RRC, new string[][]
    {
        new string[] { Left, RRC },
        new string[]
        {
            Right,
            VP,
            CoordinationTransformer.Adjective,
            JJP,
            CoordinationTransformer.Noun,
            PP,
            ADVP
        }
    });

    // Delete IN -- go for main part of sentence; add FRAG.
    rules.Add(S, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.To,
            VP,
            S,
            FRAG,
            SBAR,
            CoordinationTransformer.Adjective,
            JJP,
            UCP,
            CoordinationTransformer.Noun
        }
    });

    rules.Add(SBAR, new string[][]
    {
        new string[]
        {
            Left,
            WHNP,
            WHPP,
            WHADVP,
            WHADJP,
            PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.Determiner,
            S,
            SQ,
            SINV,
            SBAR,
            FRAG
        }
    });

    rules.Add(SBARQ, new string[][]
    {
        new string[] { Left, SQ, S, SINV, SBARQ, FRAG, SBAR }
    });

    // cdm: if you have 2 VP under an SINV, you should really take the 2nd as syntactic
    // head, because the first is a topicalized VP complement of the second, but for now
    // I didn't change this, since it didn't help parsing. (If it were changed, it'd need
    // to be also changed to the opposite in SemanticHeadFinder.)
    rules.Add(SINV, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.Verb3rdPersSingPresent,
            PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbNon3rdPersSingPresent,
            PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.Modal,
            PartsOfSpeech.VerbPastParticiple,
            VP,
            S,
            SINV,
            CoordinationTransformer.Adjective,
            JJP,
            CoordinationTransformer.Noun
        }
    });

    // TODO: Should maybe put S before SQ for tag questions. Check.
    rules.Add(SQ, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.Verb3rdPersSingPresent,
            PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbNon3rdPersSingPresent,
            PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.Modal,
            AUX,
            AUXG,
            VP,
            SQ
        }
    });

    rules.Add(UCP, new string[][] { new string[] { Right } });

    // Below is weird!! Make 2 lists, one for good and one for bad heads??
    // VP: added AUX and AUXG to work with Charniak tags.
    rules.Add(VP, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.To,
            PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Modal,
            PartsOfSpeech.Verb3rdPersSingPresent,
            PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.VerbGerundOrPresentParticiple,
            PartsOfSpeech.VerbNon3rdPersSingPresent,
            VP,
            AUX,
            AUXG,
            CoordinationTransformer.Adjective,
            JJP,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.NounPlural,
            PartsOfSpeech.Adjective,
            CoordinationTransformer.Noun,
            PartsOfSpeech.ProperNounSingular
        }
    });

    rules.Add(WHADJP, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.WhAdverb,
            WHADVP,
            PartsOfSpeech.Adverb,
            PartsOfSpeech.Adjective,
            CoordinationTransformer.Adjective,
            JJP,
            PartsOfSpeech.AdjectiveComparative
        }
    });

    rules.Add(WHADVP, new string[][]
    {
        new string[] { Right, PartsOfSpeech.WhAdverb, WHADVP }
    });

    rules.Add(WHNP, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.WhDeterminer,
            PartsOfSpeech.WhPronoun,
            PartsOfSpeech.PossessiveWhPronoun,
            WHADJP,
            WHPP,
            WHNP
        }
    });

    rules.Add(WHPP, new string[][]
    {
        new string[]
        {
            Right,
            PartsOfSpeech.PrepositionOrSubordinateConjunction,
            PartsOfSpeech.To,
            PartsOfSpeech.ForeignWord
        }
    });

    rules.Add(X, new string[][]
    {
        new string[]
        {
            Right,
            S,
            VP,
            CoordinationTransformer.Adjective,
            JJP,
            CoordinationTransformer.Noun,
            SBAR,
            PP,
            X
        }
    });

    rules.Add(CoordinationTransformer.Noun, new string[][]
    {
        new string[]
        {
            RightDis,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.ProperNounSingular,
            PartsOfSpeech.ProperNounPlural,
            PartsOfSpeech.NounPlural,
            NML,
            NX,
            PartsOfSpeech.PossessiveEnding,
            PartsOfSpeech.AdjectiveComparative
        },
        new string[] { Left, CoordinationTransformer.Noun, PartsOfSpeech.PersonalPronoun },
        new string[]
        {
            RightDis,
            PartsOfSpeech.DollarSign,
            CoordinationTransformer.Adjective,
            JJP,
            PRN,
            PartsOfSpeech.ForeignWord
        },
        new string[] { Right, PartsOfSpeech.CardinalNumber },
        new string[]
        {
            RightDis,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Adverb,
            QP,
            PartsOfSpeech.Determiner,
            PartsOfSpeech.WhDeterminer,
            PartsOfSpeech.AdverbComparative,
            ADVP
        }
    });

    // NX and NML share the very same rule array as the noun phrase entry.
    rules.Add(NX, rules[CoordinationTransformer.Noun]);
    // TODO: seems JJ should be head of NML in this case:
    //   (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
    // (although JJ great is tagged wrong)
    rules.Add(NML, rules[CoordinationTransformer.Noun]);

    rules.Add(POSSP, new string[][]
    {
        new string[] { Right, PartsOfSpeech.PossessiveEnding }
    });

    // HJT: Adding the following to deal with oddly formed data in (for example) the Brown corpus.
    rules.Add(ROOT, new string[][]
    {
        new string[] { Left, S, SQ, SINV, SBAR, FRAG }
    });
    // Just to handle trees which have TOP instead of ROOT at the root.
    rules.Add(TOP, rules[ROOT]);

    // For Brown (Roger).
    rules.Add(TYPO, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.NounSingularOrMass,
            CoordinationTransformer.Noun,
            NML,
            PartsOfSpeech.ProperNounSingular,
            PartsOfSpeech.ProperNounPlural,
            PartsOfSpeech.To,
            PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Modal,
            PartsOfSpeech.Verb3rdPersSingPresent,
            PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.VerbGerundOrPresentParticiple,
            PartsOfSpeech.VerbNon3rdPersSingPresent,
            VP,
            CoordinationTransformer.Adjective,
            JJP,
            FRAG
        }
    });

    // SWBD. Uses the NML constant like every other rule in this table
    // (this entry previously spelled it as a string literal).
    rules.Add(ADV, new string[][]
    {
        new string[]
        {
            Right,
            PartsOfSpeech.Adverb,
            PartsOfSpeech.AdverbComparative,
            PartsOfSpeech.AdverbSuperlative,
            PartsOfSpeech.ForeignWord,
            ADVP,
            PartsOfSpeech.To,
            PartsOfSpeech.CardinalNumber,
            PartsOfSpeech.AdjectiveComparative,
            PartsOfSpeech.Adjective,
            PartsOfSpeech.PrepositionOrSubordinateConjunction,
            CoordinationTransformer.Noun,
            NML,
            PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.NounSingularOrMass
        }
    });

    // Placeholder rule for Switchboard (if EDITED nodes are not deleted).
    rules.Add(EDITED, new string[][] { new string[] { Left } });

    // In sw2756, a VB appears as a parent category. (Copy of the VP rule to handle this
    // problem, though it should really be fixed on reading.)
    rules.Add(PartsOfSpeech.VerbBaseForm, new string[][]
    {
        new string[]
        {
            Left,
            PartsOfSpeech.To,
            PartsOfSpeech.VerbPastTense,
            PartsOfSpeech.VerbPastParticiple,
            PartsOfSpeech.Modal,
            PartsOfSpeech.Verb3rdPersSingPresent,
            PartsOfSpeech.VerbBaseForm,
            PartsOfSpeech.VerbGerundOrPresentParticiple,
            PartsOfSpeech.VerbNon3rdPersSingPresent,
            VP,
            AUX,
            AUXG,
            CoordinationTransformer.Adjective,
            JJP,
            PartsOfSpeech.NounSingularOrMass,
            PartsOfSpeech.NounPlural,
            PartsOfSpeech.Adjective,
            CoordinationTransformer.Noun,
            PartsOfSpeech.ProperNounSingular
        }
    });

    // Rule for OntoNotes, but maybe should just be deleted in TreeReader??
    rules.Add(META, new string[][] { new string[] { Left } });

    // Rule for new structure in QP, introduced by Stanford in QPTreeTransformer.
    rules.Add(XS, new string[][]
    {
        new string[] { Right, PartsOfSpeech.PrepositionOrSubordinateConjunction }
    });

    // Not enabled: a {Left} rule for a null parent category (OntoNotes, from Michel);
    // it would be better to fix this in TreeReader or to use a default rule.
    // TODO: set the default rule to the default left rule here if we always want to take
    // the leftmost child (instead of throwing) when no head rule is defined for the
    // mother category.

    NonTerminalInfo = rules;
}
/// <summary>
/// Creates a head finder whose rule table is the version from
/// Collins' dissertation (1999: 236-238).
/// </summary>
/// <param name="tlp">Treebank language pack; forwarded unchanged to the base constructor.</param>
/// <param name="categoriesToAvoid">Category names; forwarded unchanged to the base constructor.</param>
public CollinsHeadFinder(AbstractTreebankLanguagePack tlp, string[] categoriesToAvoid)
    : base(tlp, categoriesToAvoid)
{
    // One entry per parent category. Every rule array opens with a direction
    // constant (Left, Right or RightDis) and continues with the child
    // categories that the rule names. The initializer calls Add for each
    // entry in the order written below.
    NonTerminalInfo = new Dictionary<string, string[][]>
    {
        {
            CoordinationTransformer.Adjective,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.NounPlural, QP, PartsOfSpeech.NounSingularOrMass,
                    PartsOfSpeech.DollarSign, ADVP, PartsOfSpeech.Adjective,
                    PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple,
                    CoordinationTransformer.Adjective, PartsOfSpeech.AdjectiveComparative,
                    CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord,
                    PartsOfSpeech.AdverbComparative, PartsOfSpeech.AdverbSuperlative, SBAR,
                    PartsOfSpeech.Adverb
                }
            }
        },
        {
            ADVP,
            new[]
            {
                new string[]
                {
                    Right, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
                    PartsOfSpeech.AdverbSuperlative, PartsOfSpeech.ForeignWord, ADVP,
                    PartsOfSpeech.To, PartsOfSpeech.CardinalNumber,
                    PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.Adjective,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.NounSingularOrMass
                }
            }
        },
        {
            CONJP,
            new[]
            {
                new string[]
                {
                    Right, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.Adverb,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction
                }
            }
        },
        // Crude rule: direction only.
        { FRAG, new[] { new string[] { Right } } },
        { INTJ, new[] { new string[] { Left } } },
        {
            LST,
            new[] { new string[] { Right, PartsOfSpeech.ListItemMarker, PartsOfSpeech.ColonSemiColon } }
        },
        {
            NAC,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural,
                    PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural,
                    CoordinationTransformer.Noun, NAC, PartsOfSpeech.ExistentialThere,
                    PartsOfSpeech.DollarSign, PartsOfSpeech.CardinalNumber, QP,
                    PartsOfSpeech.PersonalPronoun, PartsOfSpeech.VerbGerundOrPresentParticiple,
                    PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Adjective,
                    PartsOfSpeech.ForeignWord
                }
            }
        },
        // Crude rule: direction only.
        { NX, new[] { new string[] { Left } } },
        // Open question from the original authors: should JJ be preferred?
        // Example: (PP (JJ such) (IN as) (NP (NN crocidolite)))
        {
            PP,
            new[]
            {
                new string[]
                {
                    Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
                    PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbPastParticiple,
                    PartsOfSpeech.Particle, PartsOfSpeech.ForeignWord
                }
            }
        },
        { PRN, new[] { new string[] { Left } } },
        { PRT, new[] { new string[] { Right, PartsOfSpeech.Particle } } },
        {
            QP,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.DollarSign,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.NounPlural,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective,
                    PartsOfSpeech.Adverb, PartsOfSpeech.Determiner, PartsOfSpeech.CardinalNumber,
                    NCD, QP, PartsOfSpeech.AdjectiveComparative,
                    PartsOfSpeech.AdjectiveSuperlative
                }
            }
        },
        {
            RRC,
            new[]
            {
                new string[]
                {
                    Right, AbstractCollinsHeadFinder.VerbPhrase, CoordinationTransformer.Noun,
                    ADVP, CoordinationTransformer.Adjective, PP
                }
            }
        },
        {
            S,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.To, PartsOfSpeech.PrepositionOrSubordinateConjunction,
                    AbstractCollinsHeadFinder.VerbPhrase, S, SBAR,
                    CoordinationTransformer.Adjective, UCP, CoordinationTransformer.Noun
                }
            }
        },
        {
            SBAR,
            new[]
            {
                new string[]
                {
                    Left, WHNP, WHPP, WHADVP, WHADJP,
                    PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.Determiner,
                    S, SQ, SINV, SBAR, FRAG
                }
            }
        },
        { SBARQ, new[] { new string[] { Left, SQ, S, SINV, SBARQ, FRAG } } },
        {
            SINV,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.Modal, AbstractCollinsHeadFinder.VerbPhrase, S, SINV,
                    CoordinationTransformer.Adjective, CoordinationTransformer.Noun
                }
            }
        },
        {
            SQ,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.Modal, AbstractCollinsHeadFinder.VerbPhrase, SQ
                }
            }
        },
        { "UCP", new[] { new string[] { Right } } },
        {
            AbstractCollinsHeadFinder.VerbPhrase,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.To, PartsOfSpeech.VerbPastTense,
                    PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.Modal,
                    PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
                    PartsOfSpeech.VerbGerundOrPresentParticiple,
                    PartsOfSpeech.VerbNon3rdPersSingPresent, AUX, AUXG,
                    AbstractCollinsHeadFinder.VerbPhrase, CoordinationTransformer.Adjective,
                    PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural,
                    CoordinationTransformer.Noun
                }
            }
        },
        {
            WHADJP,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.WhAdverb,
                    PartsOfSpeech.Adjective, CoordinationTransformer.Adjective
                }
            }
        },
        {
            WHADVP,
            new[] { new string[] { Right, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.WhAdverb } }
        },
        {
            WHNP,
            new[]
            {
                new string[]
                {
                    Left, PartsOfSpeech.WhDeterminer, PartsOfSpeech.WhPronoun,
                    PartsOfSpeech.PossessiveWhPronoun, WHADJP, WHPP, WHNP
                }
            }
        },
        {
            WHPP,
            new[]
            {
                new string[]
                {
                    Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
                    PartsOfSpeech.ForeignWord
                }
            }
        },
        // Crude rule: direction only.
        { X, new[] { new string[] { Right } } },
        // The noun-phrase entry is the only one with several rule arrays.
        {
            CoordinationTransformer.Noun,
            new[]
            {
                new string[]
                {
                    RightDis, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular,
                    PartsOfSpeech.ProperNounPlural, PartsOfSpeech.NounPlural, NX,
                    PartsOfSpeech.PossessiveEnding, PartsOfSpeech.AdjectiveComparative
                },
                new string[] { Left, CoordinationTransformer.Noun },
                new string[]
                {
                    RightDis, PartsOfSpeech.DollarSign, CoordinationTransformer.Adjective, PRN
                },
                new string[] { Right, PartsOfSpeech.CardinalNumber },
                new string[]
                {
                    RightDis, PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
                    PartsOfSpeech.Adverb, QP
                }
            }
        },
        // Another crude rule, for Brown (Roger).
        { TYPO, new[] { new string[] { Left } } },
        // Crude rule for Switchboard (applies if EDITED nodes are not deleted).
        { EDITED, new[] { new string[] { Left } } },
        // Rule for the new structure in QP.
        { XS, new[] { new string[] { Right, PartsOfSpeech.PrepositionOrSubordinateConjunction } } }
    };
}
/// <summary>
/// Builds the head-rule table for this finder, using the version of the
/// rules from Collins' dissertation (1999: 236-238).
/// </summary>
/// <param name="tlp">Treebank language pack, handed on to the base constructor.</param>
/// <param name="categoriesToAvoid">Category names, handed on to the base constructor.</param>
public CollinsHeadFinder(AbstractTreebankLanguagePack tlp, string[] categoriesToAvoid)
    : base(tlp, categoriesToAvoid)
{
    NonTerminalInfo = new Dictionary<string, string[][]>();

    // Each rule below is named, then registered under its parent category.
    // A rule starts with a direction constant (Left, Right or RightDis);
    // the remaining elements are the child categories the rule lists.

    string[] adjectivePhraseRule =
    {
        Left, PartsOfSpeech.NounPlural, QP, PartsOfSpeech.NounSingularOrMass,
        PartsOfSpeech.DollarSign, ADVP, PartsOfSpeech.Adjective,
        PartsOfSpeech.VerbPastParticiple, PartsOfSpeech.VerbGerundOrPresentParticiple,
        CoordinationTransformer.Adjective, PartsOfSpeech.AdjectiveComparative,
        CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative,
        PartsOfSpeech.Determiner, PartsOfSpeech.ForeignWord, PartsOfSpeech.AdverbComparative,
        PartsOfSpeech.AdverbSuperlative, SBAR, PartsOfSpeech.Adverb
    };
    NonTerminalInfo.Add(CoordinationTransformer.Adjective, new[] { adjectivePhraseRule });

    string[] adverbPhraseRule =
    {
        Right, PartsOfSpeech.Adverb, PartsOfSpeech.AdverbComparative,
        PartsOfSpeech.AdverbSuperlative, PartsOfSpeech.ForeignWord, ADVP, PartsOfSpeech.To,
        PartsOfSpeech.CardinalNumber, PartsOfSpeech.AdjectiveComparative,
        PartsOfSpeech.Adjective, PartsOfSpeech.PrepositionOrSubordinateConjunction,
        CoordinationTransformer.Noun, PartsOfSpeech.AdjectiveSuperlative,
        PartsOfSpeech.NounSingularOrMass
    };
    NonTerminalInfo.Add(ADVP, new[] { adverbPhraseRule });

    string[] conjunctionPhraseRule =
    {
        Right, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.Adverb,
        PartsOfSpeech.PrepositionOrSubordinateConjunction
    };
    NonTerminalInfo.Add(CONJP, new[] { conjunctionPhraseRule });

    // Crude rule: direction only.
    string[] fragmentRule = { Right };
    NonTerminalInfo.Add(FRAG, new[] { fragmentRule });

    string[] interjectionRule = { Left };
    NonTerminalInfo.Add(INTJ, new[] { interjectionRule });

    string[] listMarkerRule = { Right, PartsOfSpeech.ListItemMarker, PartsOfSpeech.ColonSemiColon };
    NonTerminalInfo.Add(LST, new[] { listMarkerRule });

    string[] nacRule =
    {
        Left, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural,
        PartsOfSpeech.ProperNounSingular, PartsOfSpeech.ProperNounPlural,
        CoordinationTransformer.Noun, NAC, PartsOfSpeech.ExistentialThere,
        PartsOfSpeech.DollarSign, PartsOfSpeech.CardinalNumber, QP,
        PartsOfSpeech.PersonalPronoun, PartsOfSpeech.VerbGerundOrPresentParticiple,
        PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
        PartsOfSpeech.AdjectiveComparative, CoordinationTransformer.Adjective,
        PartsOfSpeech.ForeignWord
    };
    NonTerminalInfo.Add(NAC, new[] { nacRule });

    // Crude rule: direction only.
    string[] nxRule = { Left };
    NonTerminalInfo.Add(NX, new[] { nxRule });

    // Open question kept from the original: should JJ be preferred?
    // Example: (PP (JJ such) (IN as) (NP (NN crocidolite)))
    string[] prepositionalPhraseRule =
    {
        Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
        PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbPastParticiple,
        PartsOfSpeech.Particle, PartsOfSpeech.ForeignWord
    };
    NonTerminalInfo.Add(PP, new[] { prepositionalPhraseRule });

    string[] parentheticalRule = { Left };
    NonTerminalInfo.Add(PRN, new[] { parentheticalRule });

    string[] particleRule = { Right, PartsOfSpeech.Particle };
    NonTerminalInfo.Add(PRT, new[] { particleRule });

    string[] quantifierPhraseRule =
    {
        Left, PartsOfSpeech.DollarSign, PartsOfSpeech.PrepositionOrSubordinateConjunction,
        PartsOfSpeech.NounPlural, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.Adjective,
        PartsOfSpeech.Adverb, PartsOfSpeech.Determiner, PartsOfSpeech.CardinalNumber, NCD, QP,
        PartsOfSpeech.AdjectiveComparative, PartsOfSpeech.AdjectiveSuperlative
    };
    NonTerminalInfo.Add(QP, new[] { quantifierPhraseRule });

    string[] reducedRelativeRule =
    {
        Right, AbstractCollinsHeadFinder.VerbPhrase, CoordinationTransformer.Noun, ADVP,
        CoordinationTransformer.Adjective, PP
    };
    NonTerminalInfo.Add(RRC, new[] { reducedRelativeRule });

    string[] sentenceRule =
    {
        Left, PartsOfSpeech.To, PartsOfSpeech.PrepositionOrSubordinateConjunction,
        AbstractCollinsHeadFinder.VerbPhrase, S, SBAR, CoordinationTransformer.Adjective, UCP,
        CoordinationTransformer.Noun
    };
    NonTerminalInfo.Add(S, new[] { sentenceRule });

    string[] sbarRule =
    {
        Left, WHNP, WHPP, WHADVP, WHADJP, PartsOfSpeech.PrepositionOrSubordinateConjunction,
        PartsOfSpeech.Determiner, S, SQ, SINV, SBAR, FRAG
    };
    NonTerminalInfo.Add(SBAR, new[] { sbarRule });

    string[] sbarqRule = { Left, SQ, S, SINV, SBARQ, FRAG };
    NonTerminalInfo.Add(SBARQ, new[] { sbarqRule });

    string[] sinvRule =
    {
        Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
        PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
        PartsOfSpeech.Modal, AbstractCollinsHeadFinder.VerbPhrase, S, SINV,
        CoordinationTransformer.Adjective, CoordinationTransformer.Noun
    };
    NonTerminalInfo.Add(SINV, new[] { sinvRule });

    string[] sqRule =
    {
        Left, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbPastTense,
        PartsOfSpeech.VerbNon3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
        PartsOfSpeech.Modal, AbstractCollinsHeadFinder.VerbPhrase, SQ
    };
    NonTerminalInfo.Add(SQ, new[] { sqRule });

    string[] ucpRule = { Right };
    NonTerminalInfo.Add("UCP", new[] { ucpRule });

    string[] verbPhraseRule =
    {
        Left, PartsOfSpeech.To, PartsOfSpeech.VerbPastTense, PartsOfSpeech.VerbPastParticiple,
        PartsOfSpeech.Modal, PartsOfSpeech.Verb3rdPersSingPresent, PartsOfSpeech.VerbBaseForm,
        PartsOfSpeech.VerbGerundOrPresentParticiple, PartsOfSpeech.VerbNon3rdPersSingPresent,
        AUX, AUXG, AbstractCollinsHeadFinder.VerbPhrase, CoordinationTransformer.Adjective,
        PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.NounPlural,
        CoordinationTransformer.Noun
    };
    NonTerminalInfo.Add(AbstractCollinsHeadFinder.VerbPhrase, new[] { verbPhraseRule });

    string[] whAdjectivePhraseRule =
    {
        Left, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.WhAdverb,
        PartsOfSpeech.Adjective, CoordinationTransformer.Adjective
    };
    NonTerminalInfo.Add(WHADJP, new[] { whAdjectivePhraseRule });

    string[] whAdverbPhraseRule =
    {
        Right, PartsOfSpeech.CoordinatingConjunction, PartsOfSpeech.WhAdverb
    };
    NonTerminalInfo.Add(WHADVP, new[] { whAdverbPhraseRule });

    string[] whNounPhraseRule =
    {
        Left, PartsOfSpeech.WhDeterminer, PartsOfSpeech.WhPronoun,
        PartsOfSpeech.PossessiveWhPronoun, WHADJP, WHPP, WHNP
    };
    NonTerminalInfo.Add(WHNP, new[] { whNounPhraseRule });

    string[] whPrepositionalPhraseRule =
    {
        Right, PartsOfSpeech.PrepositionOrSubordinateConjunction, PartsOfSpeech.To,
        PartsOfSpeech.ForeignWord
    };
    NonTerminalInfo.Add(WHPP, new[] { whPrepositionalPhraseRule });

    // Crude rule: direction only.
    string[] unknownRule = { Right };
    NonTerminalInfo.Add(X, new[] { unknownRule });

    // The noun-phrase entry is the only one that carries several rule arrays.
    string[][] nounPhraseRules =
    {
        new string[]
        {
            RightDis, PartsOfSpeech.NounSingularOrMass, PartsOfSpeech.ProperNounSingular,
            PartsOfSpeech.ProperNounPlural, PartsOfSpeech.NounPlural, NX,
            PartsOfSpeech.PossessiveEnding, PartsOfSpeech.AdjectiveComparative
        },
        new string[] { Left, CoordinationTransformer.Noun },
        new string[] { RightDis, PartsOfSpeech.DollarSign, CoordinationTransformer.Adjective, PRN },
        new string[] { Right, PartsOfSpeech.CardinalNumber },
        new string[]
        {
            RightDis, PartsOfSpeech.Adjective, PartsOfSpeech.AdjectiveSuperlative,
            PartsOfSpeech.Adverb, QP
        }
    };
    NonTerminalInfo.Add(CoordinationTransformer.Noun, nounPhraseRules);

    // Another crude rule, for Brown (Roger).
    string[] typoRule = { Left };
    NonTerminalInfo.Add(TYPO, new[] { typoRule });

    // Crude rule for Switchboard (applies if EDITED nodes are not deleted).
    string[] editedRule = { Left };
    NonTerminalInfo.Add(EDITED, new[] { editedRule });

    // Rule for the new structure in QP.
    string[] xsRule = { Right, PartsOfSpeech.PrepositionOrSubordinateConjunction };
    NonTerminalInfo.Add(XS, new[] { xsRule });
}
/// <summary>
/// Creates a ModCollinsHeadFinder. The rule table starts from the version in
/// Collins' dissertation (1999: 236-238) and carries the modifications
/// described in the inline notes.
/// </summary>
/// <param name="tlp">
/// Treebank language pack. It is passed to the base constructor, and its
/// <c>PunctuationTags()</c> are passed as the categories to avoid.
/// </param>
public ModCollinsHeadFinder(AbstractTreebankLanguagePack tlp)
    : base(tlp, tlp.PunctuationTags())
{
    // Punctuation tags are handed to the base constructor so that
    // punctuation is avoided as head in the default rule.

    // A single array instance is registered under "NP", "NX" and "NML".
    string[][] nounPhraseRules =
    {
        new[] { "rightdis", "NN", "NNP", "NNPS", "NNS", "NML", "NX", "POS", "JJR" },
        new[] { "left", "NP", "PRP" },
        new[] { "rightdis", "$", "ADJP", "JJP", "PRN", "FW" },
        new[] { "right", "CD" },
        new[] { "rightdis", "JJ", "JJS", "RB", "QP", "DT", "WDT", "RBR", "ADVP" }
    };

    // A single array instance is registered under "ROOT" and "TOP".
    string[][] rootRules =
    {
        new[] { "left", "S", "SQ", "SINV", "SBAR", "FRAG" }
    };

    // The initializer calls Add for every entry, in the order written.
    nonTerminalInfo = new Dictionary<string, string[][]>
    {
        // Notes on ADJP relative to the Collins version:
        //   NNS, NN is actually sensible (money, etc.)!
        //   QP early isn't; should prefer JJR NN RB
        //   remove ADVP; it just shouldn't be there.
        //   if two JJ, should take right one (e.g. South Korean)
        // Earlier ADJP rule, kept for reference:
        //   {"left", "NNS", "NN", "$", "QP"}, {"right", "JJ"},
        //   {"left", "VBN", "VBG", "ADJP", "JJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"}
        {
            "ADJP",
            new[]
            {
                new[] { "left", "$" },
                new[] { "rightdis", "NNS", "NN", "JJ", "QP", "VBN", "VBG" },
                new[] { "left", "ADJP" },
                new[] { "rightdis", "JJP", "JJR", "JJS", "DT", "RB", "RBR", "CD", "IN", "VBD" },
                new[] { "left", "ADVP", "NP" }
            }
        },
        // JJP is introduced for NML-like adjective phrases in Vadas' treebank;
        // Chris wishes he hadn't used JJP, which should be a POS-tag.
        {
            "JJP",
            new[]
            {
                new[]
                {
                    "left", "NNS", "NN", "$", "QP", "JJ", "VBN", "VBG", "ADJP", "JJP", "JJR",
                    "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"
                }
            }
        },
        // ADVP rule rewritten by Chris in Nov 2010 to be rightdis. This is right!
        // JJ.* is often head and rightmost.
        {
            "ADVP",
            new[]
            {
                new[] { "left", "ADVP", "IN" },
                new[] { "rightdis", "RB", "RBR", "RBS", "JJ", "JJR", "JJS" },
                new[]
                {
                    "rightdis", "RP", "DT", "NN", "CD", "NP", "VBN", "NNP", "CC", "FW", "NNS",
                    "ADJP", "NML"
                }
            }
        },
        { "CONJP", new[] { new[] { "right", "CC", "RB", "IN" } } },
        // Crude rule: direction only.
        { "FRAG", new[] { new[] { "right" } } },
        { "INTJ", new[] { new[] { "left" } } },
        { "LST", new[] { new[] { "right", "LS", ":" } } },
        // NML is head in: (NAC-LOC (NML San Antonio) (, ,) (NNP Texas))
        // TODO: NNP should be head (rare cases, could be ignored):
        //   (NAC (NML New York) (NNP Court) (PP of Appeals))
        //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
        // Chris: This could maybe still do with more thought, but NAC is rare.
        {
            "NAC",
            new[]
            {
                new[]
                {
                    "left", "NN", "NNS", "NML", "NNP", "NNPS", "NP", "NAC", "EX", "$", "CD",
                    "QP", "PRP", "VBG", "JJ", "JJS", "JJR", "ADJP", "JJP", "FW"
                }
            }
        },
        // Added JJ to PP head table, since it is a head in several cases, e.g.:
        //   (PP (JJ next) (PP to them))
        // When you have both JJ and IN daughters, it is invariably "such as" --
        // not so clear which should be head, but leave as IN.
        // Should prefer JJ? (PP (JJ such) (IN as) (NP (NN crocidolite)))
        // Michel thinks we should make JJ a head of PP.
        // Added SYM as used in new treebanks for symbols filling role of IN.
        // Changed PP search to left -- just what you want for conjunction
        // (and consistent with SemanticHeadFinder).
        {
            "PP",
            new[]
            {
                new[] { "right", "IN", "TO", "VBG", "VBN", "RP", "FW", "JJ", "SYM" },
                new[] { "left", "PP" }
            }
        },
        {
            "PRN",
            new[]
            {
                new[]
                {
                    "left", "VP", "NP", "PP", "SQ", "S", "SINV", "SBAR", "ADJP", "JJP", "ADVP",
                    "INTJ", "WHNP", "NAC", "VBP", "JJ", "NN", "NNP"
                }
            }
        },
        { "PRT", new[] { new[] { "right", "RP" } } },
        // add '#' for pounds!!
        {
            "QP",
            new[]
            {
                new[]
                {
                    "left", "$", "IN", "NNS", "NN", "JJ", "CD", "PDT", "DT", "RB", "NCD", "QP",
                    "JJR", "JJS"
                }
            }
        },
        // Reduced relative clause can be any predicate VP, ADJP, NP, PP.
        // For choosing between NP and PP, really need to know which one is
        // temporal and to choose the other.
        // It's not clear ADVP needs to be in the list at all (delete?).
        {
            "RRC",
            new[]
            {
                new[] { "left", "RRC" },
                new[] { "right", "VP", "ADJP", "JJP", "NP", "PP", "ADVP" }
            }
        },
        // delete IN -- go for main part of sentence; add FRAG
        {
            "S",
            new[] { new[] { "left", "TO", "VP", "S", "FRAG", "SBAR", "ADJP", "JJP", "UCP", "NP" } }
        },
        {
            "SBAR",
            new[]
            {
                new[]
                {
                    "left", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV",
                    "SBAR", "FRAG"
                }
            }
        },
        { "SBARQ", new[] { new[] { "left", "SQ", "S", "SINV", "SBARQ", "FRAG", "SBAR" } } },
        // cdm: if you have 2 VP under an SINV, you should really take the 2nd as
        // syntactic head, because the first is a topicalized VP complement of
        // the second, but for now I didn't change this, since it didn't help
        // parsing. (If it were changed, it'd need to be also changed to the
        // opposite in SemanticHeadFinder.)
        {
            "SINV",
            new[]
            {
                new[]
                {
                    "left", "VBZ", "VBD", "VBP", "VB", "MD", "VBN", "VP", "S", "SINV", "ADJP",
                    "JJP", "NP"
                }
            }
        },
        // TODO: Should maybe put S before SQ for tag questions. Check.
        {
            "SQ",
            new[] { new[] { "left", "VBZ", "VBD", "VBP", "VB", "MD", "AUX", "AUXG", "VP", "SQ" } }
        },
        { "UCP", new[] { new[] { "right" } } },
        // below is weird!! Make 2 lists, one for good and one for bad heads??
        // VP: added AUX and AUXG to work with Charniak tags
        {
            "VP",
            new[]
            {
                new[]
                {
                    "left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX",
                    "AUXG", "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP"
                }
            }
        },
        {
            "WHADJP",
            new[] { new[] { "left", "WRB", "WHADVP", "RB", "JJ", "ADJP", "JJP", "JJR" } }
        },
        { "WHADVP", new[] { new[] { "right", "WRB", "WHADVP" } } },
        { "WHNP", new[] { new[] { "left", "WDT", "WP", "WP$", "WHADJP", "WHPP", "WHNP" } } },
        { "WHPP", new[] { new[] { "right", "IN", "TO", "FW" } } },
        {
            "X",
            new[] { new[] { "right", "S", "VP", "ADJP", "JJP", "NP", "SBAR", "PP", "X" } }
        },
        { "NP", nounPhraseRules },
        { "NX", nounPhraseRules },
        // TODO: seems JJ should be head of NML in this case:
        //   (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
        // (although JJ great is tagged wrong)
        { "NML", nounPhraseRules },
        { "POSSP", new[] { new[] { "right", "POS" } } },
        // HJT: Adding the following to deal with oddly formed data in
        // (for example) the Brown corpus.
        { "ROOT", rootRules },
        // Just to handle trees which have TOP instead of ROOT at the root.
        { "TOP", rootRules },
        // for Brown (Roger)
        {
            "TYPO",
            new[]
            {
                new[]
                {
                    "left", "NN", "NP", "NML", "NNP", "NNPS", "TO", "VBD", "VBN", "MD", "VBZ",
                    "VB", "VBG", "VBP", "VP", "ADJP", "JJP", "FRAG"
                }
            }
        },
        // SWBD
        {
            "ADV",
            new[]
            {
                new[]
                {
                    "right", "RB", "RBR", "RBS", "FW", "ADVP", "TO", "CD", "JJR", "JJ", "IN",
                    "NP", "NML", "JJS", "NN"
                }
            }
        },
        // Crude rule for Switchboard (applies if EDITED nodes are not deleted).
        { "EDITED", new[] { new[] { "left" } } },
        // In sw2756, a "VB". (Copy of the "VP" rule to handle this problem,
        // though it should really be fixed on reading.)
        {
            "VB",
            new[]
            {
                new[]
                {
                    "left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX",
                    "AUXG", "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP"
                }
            }
        },
        // Rule for OntoNotes, but maybe should just be deleted in TreeReader??
        { "META", new[] { new[] { "left" } } },
        // Rule for new structure in QP, introduced by Stanford in QPTreeTransformer.
        { "XS", new[] { new[] { "right", "IN" } } }
    };

    // Not registered: a rule keyed on null with {"left"} (rule for OntoNotes
    // from Michel); it would be better to fix this in TreeReader or to use a
    // default rule.
    // todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.
    // defaultRule = defaultLeftRule; // Don't exception, take leftmost if no rule defined for a certain parent category
}