Exemple #1
0
 /// <summary>
 /// Returns the collection of part-of-speech tags treated as punctuation.
 /// </summary>
 /// <returns>
 /// A fixed English tag set when <c>tlp</c> is a <c>PennTreebankLanguagePack</c>;
 /// otherwise the language pack's own punctuation tags wrapped by
 /// <c>CollectionUtils.AsSet</c>.
 /// </returns>
 private ICollection <string> GetPunctuationTags()
 {
     // General case first: defer to whatever the language pack reports.
     if (!(tlp is PennTreebankLanguagePack))
     {
         return CollectionUtils.AsSet(tlp.PunctuationTags());
     }

     // Hack for English: use the punctuation tags from Danqi's paper
     // instead of the language pack's own list.
     return new HashSet <string>(Arrays.AsList("''", ",", ".", ":", "``", "-LRB-", "-RRB-"));
 }
Exemple #2
0
 /// <summary>
 /// Creates the head finder and fills <c>nonTerminalInfo</c> with one entry per
 /// phrase category. Each entry is an ordered array of rules; the first element
 /// of a rule is a search keyword ("left", "right" or "rightdis") and the
 /// remaining elements are candidate child categories. How the keywords are
 /// interpreted is decided by the base class, which is not shown here.
 /// </summary>
 /// <param name="tlp">
 /// Language pack; its punctuation tags are handed to the base constructor so
 /// that punctuation is avoided as head in the final default rule.
 /// </param>
 public ModCollinsHeadFinder(ITreebankLanguagePack tlp)
     : base(tlp, tlp.PunctuationTags())
 {
     nonTerminalInfo = Generics.NewHashMap();

     // ADJP. Starting point was the table in Collins' dissertation (1999: 236-238):
     //   - NNS, NN is actually sensible (money, etc.)!
     //   - QP early isn't; should prefer JJR NN RB
     //   - remove ADVP; it just shouldn't be there.
     //   - if two JJ, should take the right one (e.g. South Korean)
     // Earlier version, kept for reference:
     // nonTerminalInfo.put("ADJP", new String[][]{{"left", "NNS", "NN", "$", "QP"}, {"right", "JJ"}, {"left", "VBN", "VBG", "ADJP", "JJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB"}});
     nonTerminalInfo["ADJP"] = new[]
     {
         new[] { "left", "$" },
         new[] { "rightdis", "NNS", "NN", "JJ", "QP", "VBN", "VBG" },
         new[] { "left", "ADJP" },
         new[] { "rightdis", "JJP", "JJR", "JJS", "DT", "RB", "RBR", "CD", "IN", "VBD" },
         new[] { "left", "ADVP", "NP" }
     };

     // JJP is introduced for NML-like adjective phrases in Vadas' treebank;
     // Chris wishes he hadn't used JJP, which should be a POS tag.
     nonTerminalInfo["JJP"] = new[]
     {
         new[] { "left", "NNS", "NN", "$", "QP", "JJ", "VBN", "VBG", "ADJP", "JJP", "JJR", "NP", "JJS", "DT", "FW", "RBR", "RBS", "SBAR", "RB" }
     };

     // ADVP rule rewritten by Chris in Nov 2010 to be rightdis.
     // This is right! JJ.* is often head and rightmost.
     nonTerminalInfo["ADVP"] = new[]
     {
         new[] { "left", "ADVP", "IN" },
         new[] { "rightdis", "RB", "RBR", "RBS", "JJ", "JJR", "JJS" },
         new[] { "rightdis", "RP", "DT", "NN", "CD", "NP", "VBN", "NNP", "CC", "FW", "NNS", "ADJP", "NML" }
     };

     nonTerminalInfo["CONJP"] = new[] { new[] { "right", "CC", "RB", "IN" } };
     nonTerminalInfo["FRAG"] = new[] { new[] { "right" } };
     // crap
     nonTerminalInfo["INTJ"] = new[] { new[] { "left" } };
     nonTerminalInfo["LST"] = new[] { new[] { "right", "LS", ":" } };

     // NAC: NML is head in (NAC-LOC (NML San Antonio) (, ,) (NNP Texas)).
     // TODO: NNP should be head (rare cases, could be ignored):
     //   (NAC (NML New York) (NNP Court) (PP of Appeals))
     //   (NAC (NML Prudential Insurance) (NNP Co.) (PP Of America))
     // Chris: This could maybe still do with more thought, but NAC is rare.
     nonTerminalInfo["NAC"] = new[]
     {
         new[] { "left", "NN", "NNS", "NML", "NNP", "NNPS", "NP", "NAC", "EX", "$", "CD", "QP", "PRP", "VBG", "JJ", "JJS", "JJR", "ADJP", "JJP", "FW" }
     };

     // PP notes:
     //   - JJ added to the PP head table, since it is a head in several cases,
     //     e.g. (PP (JJ next) (PP to them)).
     //   - With both JJ and IN daughters it is invariably "such as" -- not so
     //     clear which should be head, but leave as IN. Should prefer JJ?
     //     (PP (JJ such) (IN as) (NP (NN crocidolite)))
     //     Michel thinks we should make JJ a head of PP.
     //   - SYM added, as used in new treebanks for symbols filling the role of IN.
     //   - Changed PP search to left -- just what you want for conjunction
     //     (and consistent with SemanticHeadFinder).
     nonTerminalInfo["PP"] = new[]
     {
         new[] { "right", "IN", "TO", "VBG", "VBN", "RP", "FW", "JJ", "SYM" },
         new[] { "left", "PP" }
     };
     nonTerminalInfo["PRN"] = new[]
     {
         new[] { "left", "VP", "NP", "PP", "SQ", "S", "SINV", "SBAR", "ADJP", "JJP", "ADVP", "INTJ", "WHNP", "NAC", "VBP", "JJ", "NN", "NNP" }
     };
     nonTerminalInfo["PRT"] = new[] { new[] { "right", "RP" } };
     // add '#' for pounds!!
     nonTerminalInfo["QP"] = new[]
     {
         new[] { "left", "$", "IN", "NNS", "NN", "JJ", "CD", "PDT", "DT", "RB", "NCD", "QP", "JJR", "JJS" }
     };

     // RRC: a reduced relative clause can be any predicate VP, ADJP, NP, PP.
     // For choosing between NP and PP, really need to know which one is
     // temporal and to choose the other.
     // It's not clear ADVP needs to be in the list at all (delete?).
     nonTerminalInfo["RRC"] = new[]
     {
         new[] { "left", "RRC" },
         new[] { "right", "VP", "ADJP", "JJP", "NP", "PP", "ADVP" }
     };

     // delete IN -- go for main part of sentence; add FRAG
     nonTerminalInfo["S"] = new[]
     {
         new[] { "left", "TO", "VP", "S", "FRAG", "SBAR", "ADJP", "JJP", "UCP", "NP" }
     };
     nonTerminalInfo["SBAR"] = new[]
     {
         new[] { "left", "WHNP", "WHPP", "WHADVP", "WHADJP", "IN", "DT", "S", "SQ", "SINV", "SBAR", "FRAG" }
     };
     nonTerminalInfo["SBARQ"] = new[]
     {
         new[] { "left", "SQ", "S", "SINV", "SBARQ", "FRAG", "SBAR" }
     };

     // cdm: if you have 2 VP under an SINV, you should really take the 2nd as
     // syntactic head, because the first is a topicalized VP complement of the
     // second, but for now I didn't change this, since it didn't help parsing.
     // (If it were changed, it'd need to be also changed to the opposite in
     // SemanticHeadFinder.)
     nonTerminalInfo["SINV"] = new[]
     {
         new[] { "left", "VBZ", "VBD", "VBP", "VB", "MD", "VBN", "VP", "S", "SINV", "ADJP", "JJP", "NP" }
     };
     nonTerminalInfo["SQ"] = new[]
     {
         new[] { "left", "VBZ", "VBD", "VBP", "VB", "MD", "AUX", "AUXG", "VP", "SQ" }
     };
     // TODO: Should maybe put S before SQ for tag questions. Check.
     nonTerminalInfo["UCP"] = new[] { new[] { "right" } };

     // below is weird!! Make 2 lists, one for good and one for bad heads??
     // VP: added AUX and AUXG to work with Charniak tags
     nonTerminalInfo["VP"] = new[]
     {
         new[] { "left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX", "AUXG", "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP" }
     };
     nonTerminalInfo["WHADJP"] = new[]
     {
         new[] { "left", "WRB", "WHADVP", "RB", "JJ", "ADJP", "JJP", "JJR" }
     };
     nonTerminalInfo["WHADVP"] = new[] { new[] { "right", "WRB", "WHADVP" } };
     nonTerminalInfo["WHNP"] = new[]
     {
         new[] { "left", "WDT", "WP", "WP$", "WHADJP", "WHPP", "WHNP" }
     };
     nonTerminalInfo["WHPP"] = new[] { new[] { "right", "IN", "TO", "FW" } };
     nonTerminalInfo["X"] = new[]
     {
         new[] { "right", "S", "VP", "ADJP", "JJP", "NP", "SBAR", "PP", "X" }
     };

     nonTerminalInfo["NP"] = new[]
     {
         new[] { "rightdis", "NN", "NNP", "NNPS", "NNS", "NML", "NX", "POS", "JJR" },
         new[] { "left", "NP", "PRP" },
         new[] { "rightdis", "$", "ADJP", "JJP", "PRN", "FW" },
         new[] { "right", "CD" },
         new[] { "rightdis", "JJ", "JJS", "RB", "QP", "DT", "WDT", "RBR", "ADVP" }
     };
     // NX and NML share the very same rule array as NP.
     nonTerminalInfo["NX"] = nonTerminalInfo["NP"];
     // TODO: seems JJ should be head of NML in this case:
     // (NP (NML (JJ former) (NML Red Sox) (JJ great)) (NNP Luis) (NNP Tiant)),
     // (although JJ great is tagged wrong)
     nonTerminalInfo["NML"] = nonTerminalInfo["NP"];

     nonTerminalInfo["POSSP"] = new[] { new[] { "right", "POS" } };

     /* HJT: Adding the following to deal with oddly formed data in (for example) the Brown corpus */
     nonTerminalInfo["ROOT"] = new[]
     {
         new[] { "left", "S", "SQ", "SINV", "SBAR", "FRAG" }
     };
     // Just to handle trees which have TOP instead of ROOT at the root
     nonTerminalInfo["TOP"] = nonTerminalInfo["ROOT"];
     nonTerminalInfo["TYPO"] = new[]
     {
         new[] { "left", "NN", "NP", "NML", "NNP", "NNPS", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "ADJP", "JJP", "FRAG" }
     };
     // for Brown (Roger)
     nonTerminalInfo["ADV"] = new[]
     {
         new[] { "right", "RB", "RBR", "RBS", "FW", "ADVP", "TO", "CD", "JJR", "JJ", "IN", "NP", "NML", "JJS", "NN" }
     };

     // SWBD
     nonTerminalInfo["EDITED"] = new[] { new[] { "left" } };
     // crap rule for Switchboard (if don't delete EDITED nodes)
     // in sw2756, a "VB". (copy "VP" to handle this problem, though should really fix it on reading)
     // Deliberately a separate array instance with the same contents as the VP rule.
     nonTerminalInfo["VB"] = new[]
     {
         new[] { "left", "TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP", "VP", "AUX", "AUXG", "ADJP", "JJP", "NN", "NNS", "JJ", "NP", "NNP" }
     };
     nonTerminalInfo["META"] = new[] { new[] { "left" } };
     // rule for OntoNotes, but maybe should just be deleted in TreeReader??
     nonTerminalInfo["XS"] = new[] { new[] { "right", "IN" } };
 }