/// <summary>
/// Creates a lexicon bound to the given word and tag indices, copying its
/// settings out of <paramref name="op"/>.
/// </summary>
/// <param name="op">Parser options; its lexOptions, trainOptions and testOptions are read here.</param>
/// <param name="wordIndex">Word index stored on this lexicon.</param>
/// <param name="tagIndex">Tag index stored on this lexicon.</param>
public BaseLexicon(Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    // Historical note [cdm, Dec 2004]: a former "rules" set somehow held a few
    // fewer entries than rulesWithWord; the reason was never figured out.
    this.op = op;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    trainOptions = op.trainOptions;
    testOptions = op.testOptions;

    var lexOpts = op.lexOptions;
    flexiTag = lexOpts.flexiTag;
    useSignatureForKnownSmoothing = lexOpts.useSignatureForKnownSmoothing;
    smoothInUnknownsThreshold = lexOpts.smoothInUnknownsThreshold;
    smartMutation = lexOpts.smartMutation;

    // The UnknownWordModel is constructed by reflection -- a right pain.
    // Lexicons and UnknownWordModels aren't very well encapsulated from each
    // other! Only the trainer class name is recorded here, falling back to the
    // base trainer when the options do not name one.
    uwModelTrainerClass = lexOpts.uwModelTrainer
        ?? "edu.stanford.nlp.parser.lexparser.BaseUnknownWordModelTrainer";
}
/// <summary>
/// Creates a tree annotator from a head finder, language-specific parser
/// parameters and the training options held by <paramref name="op"/>.
/// </summary>
/// <param name="hf">Head finder stored on this annotator.</param>
/// <param name="tlpp">Treebank language parser parameters stored on this annotator.</param>
/// <param name="op">Parser options; only its trainOptions are read here.</param>
public TreeAnnotator(IHeadFinder hf, ITreebankLangParserParams tlpp, Options op)
{
    tlpParams = tlpp;
    this.hf = hf;
    // Trees built by this annotator use labeled, scored nodes.
    tf = new LabeledScoredTreeFactory();
    trainOptions = op.trainOptions;
}
/// <summary>
/// Creates a grammar smoother over the given state and tag indices.
/// </summary>
/// <param name="trainOptions">Training options stored on this smoother.</param>
/// <param name="stateIndex">State index stored on this smoother.</param>
/// <param name="tagIndex">Tag index stored on this smoother.</param>
public LinearGrammarSmoother(TrainOptions trainOptions, IIndex<string> stateIndex, IIndex<string> tagIndex)
{
    // Set view of the annotation-introducing characters (originally taken from
    // PennTreebankLanguagePack). Do not include '@' in that list: '@' marks
    // synthetic nodes!
    var introducers = Arrays.AsList(annotationIntroducingChars);
    annoteChars = Generics.NewHashSet(introducers);

    this.trainOptions = trainOptions;
    this.stateIndex = stateIndex;
    this.tagIndex = tagIndex;
}
/// <summary>
/// Annotates the given tree, adds a root to it and binarizes it, optionally
/// converting the result to CNF.
/// </summary>
/// <remarks>
/// The tree t is normally expected to be a Penn-Treebank-style tree in which
/// the top node is an extra node that has a unary expansion. According to the
/// original documentation, if this isn't the case an extra node is added and
/// the user is warned (that handling is not visible in this method body).
/// Depending on the training options, the tree is also printed at each stage
/// and annotated rule/state counts are accumulated.
/// </remarks>
/// <param name="t">The tree to transform.</param>
/// <returns>The binarized tree, run through the CNF transformer when forceCNF is set.</returns>
public virtual Tree TransformTree(Tree t)
{
    if (trainOptions.printTreeTransformations > 0)
    {
        TrainOptions.PrintTrainTree(null, "ORIGINAL TREE:", t);
    }

    Tree annotated = annotator.TransformTree(t);
    if (trainOptions.selectivePostSplit)
    {
        annotated = postSplitter.TransformTree(annotated);
    }

    if (trainOptions.printTreeTransformations > 0)
    {
        TrainOptions.PrintTrainTree(trainOptions.printAnnotatedPW, "ANNOTATED TREE:", annotated);
    }

    if (trainOptions.printAnnotatedRuleCounts)
    {
        // Count local trees on a deep copy built with plain string labels.
        Tree plainCopy = annotated.DeepCopy(new LabeledScoredTreeFactory(), new StringLabelFactory());
        foreach (Tree localTree in plainCopy.LocalTrees())
        {
            annotatedRuleCounts.IncrementCount(localTree);
        }
    }

    if (trainOptions.printAnnotatedStateCounts)
    {
        foreach (Tree node in annotated)
        {
            if (node.IsLeaf())
            {
                continue;
            }
            annotatedStateCounts.IncrementCount(node.Label().Value());
        }
    }

    // The ROOT is added only now: if we add the ROOT first, then we don't know
    // how to percolate the heads at the top.
    AddRoot(annotated);

    // This creates a few non-binarized rules at the top.
    Tree binarized = binarizer.TransformTree(annotated);
    if (trainOptions.printTreeTransformations > 0)
    {
        TrainOptions.PrintTrainTree(trainOptions.printBinarizedPW, "BINARIZED TREE:", binarized);
        // One fewer tree left to print; printing stops once this reaches zero.
        trainOptions.printTreeTransformations--;
    }

    if (!forceCNF)
    {
        return binarized;
    }
    return new CNFTransformers.ToCNFTransformer().TransformTree(binarized);
}
/// <summary>
/// Builds the annotator, binarizer and optional post splitter used to
/// transform training trees.
/// </summary>
/// <param name="annotationHF">Head finder handed to the annotator.</param>
/// <param name="binarizationHF">Head finder handed to the binarizer.</param>
/// <param name="tlpParams">Language-specific parser parameters; also supply the treebank language pack.</param>
/// <param name="forceCNF">Stored on this instance as the forceCNF flag.</param>
/// <param name="insideFactor">Passed through to the binarizer.</param>
/// <param name="doSubcategorization">
/// When true a <c>TreeAnnotator</c> is used; otherwise a <c>TreeNullAnnotator</c>.
/// </param>
/// <param name="op">Parser options; its trainOptions configure everything built here.</param>
public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
{
    trainOptions = op.trainOptions;
    this.forceCNF = forceCNF;

    if (doSubcategorization)
    {
        annotator = new TreeAnnotator(annotationHF, tlpParams, op);
    }
    else
    {
        annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
    }

    // Gather the binarizer arguments in the same order they were evaluated inline.
    var binarizerLanguagePack = tlpParams.TreebankLanguagePack();
    bool compactAboveZero = trainOptions.CompactGrammar() > 0;
    bool compactAboveOne = trainOptions.CompactGrammar() > 1;
    binarizer = new TreeBinarizer(
        binarizationHF,
        binarizerLanguagePack,
        insideFactor,
        trainOptions.markovFactor,
        trainOptions.markovOrder,
        compactAboveZero,
        compactAboveOne,
        trainOptions.HselCut,
        trainOptions.markFinalStates,
        trainOptions.simpleBinarizedLabels,
        trainOptions.noRebinarization);

    // The post splitter exists only when selective post-splitting is enabled.
    postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;

    tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
    tlp = tlpParams.TreebankLanguagePack();

    // Each counter is allocated only when its printing option is on.
    annotatedRuleCounts = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter<Tree>() : null;
    annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter<string>() : null;
}
/// <summary>
/// Creates an unknown word model from already-collected statistics, deriving
/// its feature flags from the lexicon options in <paramref name="op"/>.
/// </summary>
/// <param name="op">Parser options; its lexOptions and trainOptions are read here.</param>
/// <param name="lex">Lexicon stored on this model.</param>
/// <param name="wordIndex">Word index stored on this model.</param>
/// <param name="tagIndex">Tag index stored on this model.</param>
/// <param name="unSeenCounter">Counter stored on this model as unSeenCounter.</param>
/// <param name="tagHash">Map stored on this model as tagHash.</param>
/// <param name="unknownGT">Map stored on this model as unknownGT.</param>
/// <param name="seenEnd">Collection stored on this model as seenEnd.</param>
public BaseUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter, IDictionary<ILabel, ClassicCounter<string>> tagHash, IDictionary<string, float> unknownGT, ICollection<string> seenEnd)
{
    this.lex = lex;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    this.unSeenCounter = unSeenCounter;
    this.tagHash = tagHash;
    this.unknownGT = unknownGT;
    this.seenEnd = seenEnd;
    trainOptions = op.trainOptions;

    // TODO: refactor these terms into BaseUnknownWordModelTrainer
    var signatureLevel = op.lexOptions.useUnknownWordSignatures;
    var suffixSize = op.lexOptions.unknownSuffixSize;
    bool signaturesEnabled = signatureLevel > 0;

    unknownLevel = signatureLevel;
    // endLength is only used if useEnd == true.
    endLength = suffixSize;
    useEnd = suffixSize > 0 && signaturesEnabled;
    // Only care if first is capitalized.
    useFirstCap = signaturesEnabled;
    useGT = signatureLevel == 0;
    useFirst = false;
}
/// <summary>
/// Creates a post splitter that uses the head finder supplied by
/// <paramref name="tlpParams"/> and the training options from <paramref name="op"/>.
/// </summary>
/// <param name="tlpParams">Language-specific parser parameters; also the source of the head finder.</param>
/// <param name="op">Parser options; only its trainOptions are read here.</param>
public PostSplitter(ITreebankLangParserParams tlpParams, Options op)
{
    this.tlpParams = tlpParams;

    var headFinder = tlpParams.HeadFinder();
    hf = headFinder;

    trainOptions = op.trainOptions;
}