/// <summary>
/// Scores each multi-child local tree of each annotated gold tree in the test
/// treebank against the parser's grammar, printing the rule and its score.
/// </summary>
/// <param name="pd">Parser whose grammar supplies local-tree scores</param>
/// <param name="tlpParams">Language-specific treebank parameters (head finder, etc.)</param>
/// <param name="testTreebank">Gold trees to evaluate</param>
/// <param name="treebankRoot">Path used to load the (hardwired English) parent-split categories</param>
/// <param name="stateIndex">Index mapping state names to integer ids</param>
private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex<string> stateIndex)
{
    Timing.StartTime();
    ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones. Otherwise need training data.
    // op.trainOptions.splitters = new HashSet(Arrays.asList(op.tlpParams.splitters()));
    op.trainOptions.splitters = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
    foreach (Tree goldTree in testTreebank)
    {
        // BUGFIX: a foreach iteration variable cannot be reassigned in C#;
        // hold the annotated tree in a separate local instead.
        Tree annotatedGold = annotator.TransformTree(goldTree);
        foreach (Tree localTree in annotatedGold)
        {
            // Only score real rules: skip leaves, preterminals, and unary local trees.
            if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
            {
                continue;
            }
            System.Console.Out.WriteLine(LocalTreeToRule(localTree));
            double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
            // (An empty "if (score == double.NegativeInfinity)" block with a
            // commented-out println was removed: it had no effect.)
            System.Console.Out.WriteLine("score: " + score);
        }
    }
}
/// <summary>
/// Builds the dependency grammar and loads the two word-similarity maps used
/// to smooth head/argument dependency probabilities.
/// </summary>
public ChineseSimWordAvgDepGrammar(ITreebankLangParserParams tlpParams, bool directional, bool distance, bool coarseDistance, bool basicCategoryTagsInDependencyGrammar, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : base(tlpParams, directional, distance, coarseDistance, basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex)
{
    //private static final double MIN_PROBABILITY = Math.exp(-100.0);
    // Load similarity maps. headArgFile/argHeadFile are fields declared elsewhere
    // in this class — presumably paths to similarity data files; keep this load
    // order unless GetMap is known to be order-independent (TODO confirm).
    simHeadMap = GetMap(headArgFile);
    simArgMap = GetMap(argHeadFile);
}
/// <summary>
/// Creates the annotator. The tlpp class name is read from the property
/// "&lt;annotatorName&gt;.tlppClass" (falling back to DefaultTlppClass), the class is
/// instantiated reflectively, and a simple tree binarizer is built from it.
/// </summary>
public BinarizerAnnotator(string annotatorName, Properties props)
{
    string tlppClassKey = annotatorName + ".tlppClass";
    this.tlppClass = props.GetProperty(tlppClassKey, DefaultTlppClass);
    ITreebankLangParserParams parserParams = ReflectionLoading.LoadByReflection(tlppClass);
    this.binarizer = TreeBinarizer.SimpleTreeBinarizer(parserParams.HeadFinder(), parserParams.TreebankLanguagePack());
}
/// <summary>
/// Constructs a TreeAnnotator from a head finder, language-specific parser
/// parameters, and global options; training options are taken from the Options.
/// </summary>
public TreeAnnotator(IHeadFinder hf, ITreebankLangParserParams tlpp, Options op)
{
    // Simple field wiring; assignment order is irrelevant here.
    this.hf = hf;
    this.tlpParams = tlpp;
    this.trainOptions = op.trainOptions;
    this.tf = new LabeledScoredTreeFactory();
}
/// <summary>
/// Loads one or more treebank files and prints every tree, either stripping
/// the language's start-symbol bracket (with -b) or adding it when missing.
/// </summary>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(Usage());
        System.Environment.Exit(-1);
    }
    Properties options = StringUtils.ArgsToProperties(args, ArgDefs());
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    // BUGFIX: the encoding option is "e"; "l" is the language option read above.
    // Reading "l" here would pass a language name (e.g. "English") as a charset.
    // Sibling tools in this file read encoding from "e".
    string encoding = options.GetProperty("e", "UTF-8");
    bool removeBracket = PropertiesUtils.GetBool(options, "b", false);
    tlpp.SetInputEncoding(encoding);
    tlpp.SetOutputEncoding(encoding);
    DiskTreebank tb = tlpp.DiskTreebank();
    // Remaining (non-flag) arguments are the treebank files to load.
    string[] files = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (files.Length != 0)
    {
        foreach (string filename in files)
        {
            tb.LoadPath(filename);
        }
    }
    else
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    PrintWriter pwo = tlpp.Pw();
    string startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
    ITreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    foreach (Tree t in tb)
    {
        // BUGFIX: a foreach iteration variable cannot be reassigned in C#;
        // build the output tree in a separate local.
        Tree outputTree = t;
        if (removeBracket)
        {
            if (outputTree.Value().Equals(startSymbol))
            {
                outputTree = outputTree.FirstChild();
            }
        }
        else if (!outputTree.Value().Equals(startSymbol))
        {
            // Add a start-symbol bracket if it isn't already there
            outputTree = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(outputTree));
        }
        pwo.Println(outputTree.ToString());
        nTrees++;
    }
    pwo.Close();
    System.Console.Error.Printf("Processed %d trees.%n", nTrees);
}
/// <summary>
/// Constructs the lexicon, capturing the word/tag indices and feature level and
/// copying the options object plus its language pack and parser params.
/// </summary>
public ChineseMaxentLexicon(Options op, IIndex<string> wordIndex, IIndex<string> tagIndex, int featureLevel)
{
    // Index and feature-level wiring.
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    this.featureLevel = featureLevel;
    // Configuration derived from the options object.
    this.op = op;
    this.tlpParams = op.tlpParams;
    this.ctlp = op.tlpParams.TreebankLanguagePack();
}
/// <summary>
/// Sets up an MLE dependency grammar extractor, copying the relevant
/// dependency-grammar flags out of the supplied Options.
/// </summary>
public MLEDependencyGrammarExtractor(Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : base(op)
{
    //private Set dependencies = new HashSet();
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    // Flags copied from the options object.
    tlpParams = op.tlpParams;
    directional = op.directional;
    useDistance = op.distance;
    useCoarseDistance = op.coarseDistance;
    basicCategoryTagsInDependencyGrammar = op.trainOptions.basicCategoryTagsInDependencyGrammar;
}
/// <summary>
/// (Re)initializes the output writers: the gold writer is cleared, the test
/// writer is opened on the given file, and the tree counter is reset.
/// </summary>
/// <param name="tlpParams">Supplies the encoding-aware PrintWriter factory</param>
/// <param name="testFilename">File that will receive test (guess) trees</param>
public virtual void InitFiles(ITreebankLangParserParams tlpParams, string testFilename)
{
    try
    {
        goldWriter = null;
        testWriter = tlpParams.Pw(new FileOutputStream(testFilename));
    }
    catch (IOException e)
    {
        // BUGFIX: System.Exception has no constructor taking an Exception
        // (this was Java's "new RuntimeException(e)"); wrap with message +
        // inner exception so the cause is preserved.
        throw new Exception(e.Message, e);
    }
    count = 0;
}
/// <summary>
/// Main constructor: reads the four language-specific smoothing hyperparameters
/// from the parser params and sets up the argument/stop dependency counters.
/// </summary>
public MLEDependencyGrammar(ITagProjection tagProjection, ITreebankLangParserParams tlpParams, bool directional, bool useDistance, bool useCoarseDistance, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : base(tlpParams.TreebankLanguagePack(), tagProjection, directional, useDistance, useCoarseDistance, op, wordIndex, tagIndex)
{
    // tagProjection defines the reduced tag space.
    // public double distanceDecay = 0.0;
    // Legacy notes on the smoothing hyperparameters (only relevant when
    // useSmoothTagProjection is true):
    //   - back off Bayesian m-estimate of aTW given aT to aPTW given aPT
    //   - back off Bayesian m-estimate of aTW_hTd to aPTW_hPTd (?? guessed, not tuned)
    //   - back off Bayesian m-estimate of aT_hTd to aPT_hPTd (?? guessed, not tuned)
    //   - back off word prediction from tag to projected tag (only used if useUnigramWordSmoothing is true)
    useSmoothTagProjection = op.useSmoothTagProjection;
    useUnigramWordSmoothing = op.useUnigramWordSmoothing;
    argCounter = new ClassicCounter<IntDependency>();
    stopCounter = new ClassicCounter<IntDependency>();
    // Language-specific smoothing constants, in a fixed positional order.
    double[] smoothParams = tlpParams.MLEDependencyGrammarSmoothingParams();
    smooth_aT_hTWd = smoothParams[0];
    smooth_aTW_hTWd = smoothParams[1];
    smooth_stop = smoothParams[2];
    interp = smoothParams[3];
    // cdm added Jan 2007 to play with dep grammar smoothing. Integrate this better if we keep it!
    smoothTP = new BasicCategoryTagProjection(tlpParams.TreebankLanguagePack());
}
/// <summary>
/// Command-line entry point: parses options, collects the corpus paths, and
/// runs treebank statistics for the requested language.
/// </summary>
/// <param name="args"/>
public static void Main(string[] args)
{
    if (args.Length < MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    Properties options = StringUtils.ArgsToProperties(args, OptArgDefs());
    string splitPrefix = options.GetProperty("s", null);
    // Locals renamed to conventional camelCase (behavior unchanged).
    bool showWords = PropertiesUtils.GetBool(options, "w", false);
    bool pathsAreFiles = PropertiesUtils.GetBool(options, "f", false);
    bool showOov = PropertiesUtils.GetBool(options, "o", false);
    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    // NOTE(review): this demands exactly MinArgs positional arguments even though
    // the loop below is written for a variable number of corpus paths — confirm intent.
    if (parsedArgs.Length != MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(parsedArgs[0]);
    IList<string> corpusPaths = new List<string>(parsedArgs.Length - 1);
    for (int i = 1; i < parsedArgs.Length; ++i)
    {
        corpusPaths.Add(parsedArgs[i]);
    }
    ITreebankLangParserParams tlpp = language.@params;
    TreebankStats cs = new TreebankStats(language, corpusPaths, tlpp);
    if (splitPrefix != null && !cs.UseSplit(splitPrefix))
    {
        log.Info("Could not load split!");
    }
    cs.Run(pathsAreFiles, showWords, showOov);
}
/// <summary>Creates a collinizer for the Negra/Penn German treebank.</summary>
/// <param name="tlpp">Language-specific treebank parser parameters</param>
/// <param name="deletePunct">Whether punctuation is deleted when normalizing trees</param>
public NegraPennCollinizer(ITreebankLangParserParams tlpp, bool deletePunct)
{
    this.tlpp = tlpp;
    this.deletePunct = deletePunct;
}
/// <summary>Convenience constructor: punctuation is deleted by default.</summary>
public NegraPennCollinizer(ITreebankLangParserParams tlpp)
    : this(tlpp, true)
{
}
/// <summary>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// </summary>
/// <remarks>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// <p>
/// <i>Implementation note:</i> This method is largely cloned from
/// LexicalizedParser's main method. Should we try to have it be able
/// to train segmenters to stop things going out of sync?
/// </remarks>
public static void Main(string[] args)
{
    bool train = false;
    bool saveToSerializedFile = false;
    bool saveToTextFile = false;
    string serializedInputFileOrUrl = null;
    string textInputFileOrUrl = null;
    string serializedOutputFileOrUrl = null;
    string textOutputFileOrUrl = null;
    string treebankPath = null;
    Treebank testTreebank = null;
    // Treebank tuneTreebank = null;
    string testPath = null;
    IFileFilter testFilter = null;
    IFileFilter trainFilter = null;
    string encoding = null;
    // Variables needed to process the files to be parsed.
    // NOTE(review): tokenizerFactory/tokenized/escaper are vestigial here
    // (never read below); kept to preserve the original behavior exactly.
    ITokenizerFactory<Word> tokenizerFactory = null;
    // DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
    bool tokenized = false;
    // whether or not the input file has already been tokenized
    IFunction<IList<IHasWord>, IList<IHasWord>> escaper = new ChineseEscaper();
    // int tagDelimiter = -1;
    // String sentenceDelimiter = "\n";
    // boolean fromXML = false;
    int argIndex = 0;
    if (args.Length < 1)
    {
        log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
        return;
    }
    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();
    // while loop through option arguments
    while (argIndex < args.Length && args[argIndex][0] == '-')
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
        {
            train = true;
            saveToSerializedFile = true;
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs > 1)
            {
                treebankPath = args[argIndex];
                argIndex++;
            }
            else
            {
                throw new Exception("Error: -train option must have treebankPath as first argument.");
            }
            if (numSubArgs == 2)
            {
                trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else if (numSubArgs >= 3)
            {
                try
                {
                    int low = System.Convert.ToInt32(args[argIndex]);
                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                }
                catch (NumberFormatException)
                {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
        {
            // sets encoding for TreebankLangParserParams
            encoding = args[argIndex + 1];
            op.tlpParams.SetInputEncoding(encoding);
            op.tlpParams.SetOutputEncoding(encoding);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
        {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        // doesn't make sense to load from TextFile -pichuan
        // } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        //   // load the parser from declarative text file
        //   // the next argument must be the path to the parser file
        //   textInputFileOrUrl = args[argIndex + 1];
        //   argIndex += 2;
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
        {
            saveToSerializedFile = true;
            serializedOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
        {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
        {
            // the next argument is the treebank path and range for testing
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs == 1)
            {
                testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else if (numSubArgs > 1)
            {
                testPath = args[argIndex++];
                if (numSubArgs == 2)
                {
                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                }
                else if (numSubArgs >= 3)
                {
                    try
                    {
                        int low = System.Convert.ToInt32(args[argIndex]);
                        int high = System.Convert.ToInt32(args[argIndex + 1]);
                        testFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    }
                    catch (NumberFormatException)
                    {
                        // maybe it's a ranges expression?
                        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                }
            }
        }
        else
        {
            // hand any other flag to the language-specific params object
            int j = op.tlpParams.SetOptionFlag(args, argIndex);
            if (j == argIndex)
            {
                log.Info("Unknown option ignored: " + args[argIndex]);
                j++;
            }
            argIndex = j;
        }
    }
    // end while loop through arguments
    ITreebankLangParserParams tlpParams = op.tlpParams;
    // all other arguments are order dependent and
    // are processed in order below
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose)
    {
        System.Console.Out.WriteLine("Currently " + new DateTime());
        PrintArgs(args, System.Console.Out);
    }
    if (train)
    {
        PrintArgs(args, System.Console.Out);
        // so we train a parser using the treebank
        if (treebankPath == null)
        {
            // the next arg must be the treebank path, since it wasn't given earlier
            treebankPath = args[argIndex];
            argIndex++;
            if (args.Length > argIndex + 1)
            {
                try
                {
                    // the next two args might be the range
                    int low = System.Convert.ToInt32(args[argIndex]);
                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                }
                catch (NumberFormatException)
                {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        Treebank trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
        IIndex<string> wordIndex = new HashIndex<string>();
        IIndex<string> tagIndex = new HashIndex<string>();
        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    }
    else if (textInputFileOrUrl != null)
    {
        // so we load the segmenter from a text grammar file
        // XXXXX fix later -pichuan
        //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
    }
    else
    {
        // so we load a serialized segmenter
        if (serializedInputFileOrUrl == null)
        {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        try
        {
            cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
        }
        catch (ArgumentException)
        {
            log.Info("Error loading segmenter, exiting...");
            System.Environment.Exit(0);
        }
    }
    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.TreePrint(tlpParams);
    if (testFilter != null)
    {
        if (testPath == null)
        {
            if (treebankPath == null)
            {
                throw new Exception("No test treebank path specified...");
            }
            else
            {
                log.Info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
                testPath = treebankPath;
            }
        }
        testTreebank = tlpParams.TestMemoryTreebank();
        testTreebank.LoadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
    // BUGFIX: the following comment had lost its "//" delimiter mid-sentence
    // ("This is" appeared as stray code tokens); restored as a comment.
    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again. We also set the tlpParams of the
    // LexicalizedParser instance to be the same object. This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose)
    {
        log.Info("Lexicon is " + cs.GetType().FullName);
    }
    PrintWriter pwOut = tlpParams.Pw();
    PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
    // Now what do we do with the parser we've made
    if (saveToTextFile)
    {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null)
        {
            SaveSegmenterDataToText(cs, textOutputFileOrUrl);
        }
        else
        {
            log.Info("Usage: must specify a text segmenter data output path");
        }
    }
    if (saveToSerializedFile)
    {
        if (serializedOutputFileOrUrl == null && argIndex < args.Length)
        {
            // the next argument must be the path to serialize to
            serializedOutputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedOutputFileOrUrl != null)
        {
            SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
        }
        else if (textOutputFileOrUrl == null && testTreebank == null)
        {
            // no saving/parsing request has been specified
            log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
        }
    }
    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose)
    {
    }
    // printOptions(false, op);
    if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
    {
        // test parser on treebank
        if (testTreebank == null)
        {
            // the next argument is the treebank path and range for testing
            testTreebank = tlpParams.TestMemoryTreebank();
            if (args.Length < argIndex + 4)
            {
                testTreebank.LoadPath(args[argIndex + 1]);
            }
            else
            {
                int testlow = System.Convert.ToInt32(args[argIndex + 2]);
                int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
            }
        }
    }
}
/// <summary>
/// Trains a ChineseLexiconAndWordSegmenter from the given treebank: annotates
/// and binarizes the training trees (optionally learning selective splits
/// first), then trains the lexicon on the binarized trees.
/// </summary>
private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    System.Console.Out.WriteLine("Currently " + new DateTime());
    // printOptions(true, op);
    Timing.StartTime();
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose)
    {
        System.Console.Out.Write("Training ");
        System.Console.Out.WriteLine(trainTreebank.TextualSummary());
    }
    System.Console.Out.Write("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    // initialized below
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    else
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc)
    {
        collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
    }
    IList<Tree> binaryTrainTrees = new List<Tree>();
    // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent split categories: " + op.trainOptions.splitters);
        }
    }
    if (op.trainOptions.selectivePostSplit)
    {
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        Treebank annotatedTB = trainTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }
    if (op.trainOptions.hSelSplit)
    {
        // First pass only gathers counts for hSelSplit; transformed trees are discarded.
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            // BUGFIX: a foreach iteration variable cannot be reassigned in C#;
            // use a separate local for the transformed tree.
            Tree t = tree;
            if (op.trainOptions.collinsPunc)
            {
                t = collinsPuncTransformer.TransformTree(t);
            }
            binarizer.TransformTree(t);
        }
        binarizer.SetDoSelectiveSplit(true);
    }
    foreach (Tree tree in trainTreebank)
    {
        Tree t = tree;
        if (op.trainOptions.collinsPunc)
        {
            t = collinsPuncTransformer.TransformTree(t);
        }
        t = binarizer.TransformTree(t);
        binaryTrainTrees.Add(t);
    }
    Timing.Tick("done.");
    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }
    System.Console.Out.Write("Extracting Lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
    clex.InitializeTraining(binaryTrainTrees.Count);
    clex.Train(binaryTrainTrees);
    clex.FinishTraining();
    Timing.Tick("done.");
    return clex;
}
// Associates this Language value with its language-specific parser parameters.
internal Language(ITreebankLangParserParams @params)
{
    this.@params = @params;
}
/// <summary>
/// Records the language, corpus paths, and parser parameters used when
/// gathering treebank statistics.
/// </summary>
public TreebankStats(Language langName, IList<string> paths, ITreebankLangParserParams tlpp)
{
    // Plain field wiring; order is irrelevant.
    this.tlpp = tlpp;
    languageName = langName;
    pathNames = paths;
}
// initial value is -0xDEADBEEF (actually positive because of 2s complement)
// Don't change this; set with -v
/// <summary>Determines method for print trees on output.</summary>
/// <param name="tlpParams">The treebank parser params</param>
/// <returns>A suitable tree printing object</returns>
public virtual Edu.Stanford.Nlp.Trees.TreePrint TreePrint(ITreebankLangParserParams tlpParams)
{
    // Pull the collaborators out in the same order the original evaluated them.
    var languagePack = tlpParams.TreebankLanguagePack();
    var headFinder = tlpParams.HeadFinder();
    var dependencyHeadFinder = tlpParams.TypedDependencyHeadFinder();
    return new Edu.Stanford.Nlp.Trees.TreePrint(outputFormat, outputFormatOptions, languagePack, headFinder, dependencyHeadFinder);
}
/// <summary>Run the Evalb scoring metric on guess/gold input.</summary>
/// <remarks>Run the Evalb scoring metric on guess/gold input. The default language is English.</remarks>
/// <param name="args"/>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    int maxGoldYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);
    bool verbose = PropertiesUtils.GetBool(options, "v", false);
    bool sortByF1 = PropertiesUtils.HasProperty(options, "s");
    int worstKTreesToEmit = PropertiesUtils.GetInt(options, "s", 0);
    PriorityQueue<Triple<double, Tree, Tree>> queue = sortByF1 ? new PriorityQueue<Triple<double, Tree, Tree>>(2000, new Evalb.F1Comparator()) : null;
    bool doCatLevel = PropertiesUtils.GetBool(options, "c", false);
    string labelRegex = options.GetProperty("f", null);
    string encoding = options.GetProperty("e", "UTF-8");
    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (parsedArgs.Length != minArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    string goldFile = parsedArgs[0];
    string guessFile = parsedArgs[1];
    // Command-line has been parsed. Configure the metric for evaluation.
    tlpp.SetInputEncoding(encoding);
    PrintWriter pwOut = tlpp.Pw();
    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());
    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());
    Evalb metric = new Evalb("Evalb LP/LR", true);
    EvalbByCat evalbCat = doCatLevel ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null;
    ITreeTransformer tc = tlpp.Collinizer();
    // The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    // In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    IEnumerator<Tree> guessItr = guessTreebank.GetEnumerator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    while (guessItr.MoveNext() && goldItr.MoveNext())
    {
        Tree guessTree = guessItr.Current;
        IList<ILabel> guessYield = guessTree.Yield();
        guessLineId++;
        Tree goldTree = goldItr.Current;
        IList<ILabel> goldYield = goldTree.Yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.Count > maxGoldYield)
        {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.Count != guessYield.Count)
        {
            pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        Tree evalGuess = tc.TransformTree(guessTree);
        Tree evalGold = tc.TransformTree(goldTree);
        metric.Evaluate(evalGuess, evalGold, verbose ? pwOut : null);
        if (doCatLevel)
        {
            evalbCat.Evaluate(evalGuess, evalGold, verbose ? pwOut : null);
        }
        if (sortByF1)
        {
            StoreTrees(queue, guessTree, goldTree, metric.GetLastF1());
        }
    }
    if (guessItr.MoveNext() || goldItr.MoveNext())
    {
        // BUGFIX: the %n was placed before the sentence-final period,
        // printing the period on its own line.
        System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d).%n", guessLineId, goldLineId);
    }
    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        // BUGFIX: use %n (platform newline), consistent with every other
        // format string in this method, instead of a literal "\n".
        pwOut.Printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees);
    }
    metric.Display(true, pwOut);
    pwOut.Println();
    if (doCatLevel)
    {
        evalbCat.Display(true, pwOut);
        pwOut.Println();
    }
    if (sortByF1)
    {
        EmitSortedTrees(queue, worstKTreesToEmit, guessFile);
    }
    pwOut.Close();
}
/// <summary>
/// Reads a treebank and prints statistics about its binary (multi-child)
/// phrasal rules: counts, branching factors, and rule-type entropy.
/// </summary>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage);
        System.Environment.Exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.ArgsToProperties(args, optionArgDefinitions);
    string fileName = options.GetProperty(string.Empty);
    if (fileName == null || fileName.Equals(string.Empty))
    {
        System.Console.Out.WriteLine(usage);
        System.Environment.Exit(-1);
    }
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    string encoding = options.GetProperty("e", "UTF-8");
    tlpp.SetInputEncoding(encoding);
    tlpp.SetOutputEncoding(encoding);
    DiskTreebank tb = tlpp.DiskTreebank();
    tb.LoadPath(fileName);
    // Statistics accumulators
    ICounter<string> binaryRuleTypes = new ClassicCounter<string>(20000);
    IList<int> branchingFactors = new List<int>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;
    // Read the treebank
    PrintWriter pw = tlpp.Pw();
    foreach (Tree t in tb)
    {
        // BUGFIX: a foreach iteration variable cannot be reassigned in C#;
        // strip the ROOT bracket into a separate local instead.
        // NOTE(review): "ROOT" is hardcoded here; sibling tools use
        // tlpp.TreebankLanguagePack().StartSymbol() — confirm intent.
        Tree tree = t.Value().Equals("ROOT") ? t.FirstChild() : t;
        ++nTrees;
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsPhrasal())
            {
                continue;
            }
            if (subTree.NumChildren() > 1)
            {
                ++nBinaryRules;
                branchingFactors.Add(subTree.NumChildren());
                binaryBranchingFactors += subTree.NumChildren();
                binaryRuleTypes.IncrementCount(TreeToRuleString(subTree));
            }
            else
            {
                ++nUnaryRules;
            }
        }
    }
    // Prints NaN if the treebank contained no binary rules.
    double mean = (double)binaryBranchingFactors / (double)nBinaryRules;
    System.Console.Out.Printf("#trees:\t%d%n", nTrees);
    System.Console.Out.Printf("#binary:\t%d%n", nBinaryRules);
    System.Console.Out.Printf("#binary types:\t%d%n", binaryRuleTypes.KeySet().Count);
    System.Console.Out.Printf("mean branching:\t%.4f%n", mean);
    System.Console.Out.Printf("stddev branching:\t%.4f%n", StandardDeviation(branchingFactors, mean));
    System.Console.Out.Printf("rule entropy:\t%.5f%n", Counters.Entropy(binaryRuleTypes));
    System.Console.Out.Printf("#unaries:\t%d%n", nUnaryRules);
}
/// <summary>
/// Initializes the default EVALB gold/test output files on the shared default
/// writer. (Delegates to a three-argument InitFiles overload declared elsewhere.)
/// </summary>
public static void InitEVALBfiles(ITreebankLangParserParams tlpParams)
{
    DefaultWriter.InitFiles(tlpParams, DefaultGoldFilename, DefaultTestFilename);
}
/// <summary>
/// Builds the transformer chain (pre-transform, Collins punctuation, annotation
/// + binarization, word function), optionally learns selective splits over the
/// combined treebank, then applies the chain to the train/secondary/tune
/// treebanks. Note: mutates op.trainOptions (splitters, postSplitters, etc.).
/// </summary>
/// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns>
public static Triple<Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
{
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();
    if (op.testOptions.verbose)
    {
        PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
        pwErr.Print("Training ");
        pwErr.Println(trainTreebank.TextualSummary(tlp));
        if (secondaryTreebank != null)
        {
            pwErr.Print("Secondary training ");
            pwErr.Println(secondaryTreebank.TextualSummary(tlp));
        }
    }
    // Chain of transformations applied (in order) to every training tree.
    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null)
    {
        trainTransformer.AddTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc)
    {
        CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
        trainTransformer.AddTransformer(collinsPuncTransformer);
    }
    log.Info("Binarizing trees...");
    Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    else
    {
        // Left-to-right binarization uses a LeftHeadFinder for the binarization step.
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    trainTransformer.AddTransformer(binarizer);
    if (op.wordFunction != null)
    {
        ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
        trainTransformer.AddTransformer(wordFunctionTransformer);
    }
    // Split statistics are learned over train + secondary combined.
    Treebank wholeTreebank;
    if (secondaryTreebank == null)
    {
        wholeTreebank = trainTreebank;
    }
    else
    {
        wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }
    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp, op);
        if (op.testOptions.verbose)
        {
            IList<string> list = new List<string>(op.trainOptions.splitters);
            list.Sort();
            log.Info("Parent split categories: " + list);
        }
    }
    if (op.trainOptions.selectivePostSplit)
    {
        // Do all the transformations once just to learn selective splits on annotated categories
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        wholeTreebank = wholeTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }
    if (op.trainOptions.hSelSplit)
    {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Transformed trees are discarded; tree printing is suppressed for this pass.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in wholeTreebank)
        {
            trainTransformer.TransformTree(tree);
        }
        binarizer.SetDoSelectiveSplit(true);
        op.trainOptions.printTreeTransformations = ptt;
    }
    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.Transform(trainTransformer);
    if (secondaryTreebank != null)
    {
        secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
    }
    if (op.trainOptions.printAnnotatedStateCounts)
    {
        binarizer.PrintStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts)
    {
        binarizer.PrintRuleCounts();
    }
    if (tuneTreebank != null)
    {
        tuneTreebank = tuneTreebank.Transform(trainTransformer);
    }
    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }
    return (new Triple<Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank));
}
/// <summary>
/// Annotates and binarizes every tree in the treebank, optionally learning
/// selective splits in a first counting pass (hSelSplit), and returns the
/// binarized trees whose yield does not exceed trainLengthLimit.
/// Note: mutates op.trainOptions (splitters, printTreeTransformations).
/// </summary>
public virtual IList<Tree> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank)
{
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();
    if (Verbose)
    {
        log.Info("\n\n" + trainTreebank.TextualSummary(tlp));
    }
    log.Info("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    Timing.Tick("done.");
    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp);
        if (op.testOptions.verbose)
        {
            IList<string> list = new List<string>(op.trainOptions.splitters);
            list.Sort();
            log.Info("Parent split categories: " + list);
        }
    }
    // (A commented-out selectivePostSplit block from the Java original was
    // removed here; see the static overload for the live implementation.)
    if (op.trainOptions.hSelSplit)
    {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Transformed trees are discarded; tree printing is suppressed for this pass.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            binarizer.TransformTree(tree);
        }
        binarizer.SetDoSelectiveSplit(true);
        op.trainOptions.printTreeTransformations = ptt;
    }
    // Tree transformation
    IList<Tree> binaryTrainTrees = new List<Tree>();
    foreach (Tree tree in trainTreebank)
    {
        // BUGFIX: a foreach iteration variable cannot be reassigned in C#;
        // hold the binarized tree in a separate local.
        Tree binarized = binarizer.TransformTree(tree);
        // Yield includes the boundary symbol, hence the -1.
        if (binarized.Yield().Count - 1 <= trainLengthLimit)
        {
            binaryTrainTrees.Add(binarized);
        }
    }
    return binaryTrainTrees;
}
/// <summary>
/// Convenience constructor that chooses the tag projection from
/// <paramref name="basicCategoryTagsInDependencyGrammar"/>: a
/// <c>BasicCategoryTagProjection</c> built from the language pack when true,
/// otherwise a <c>TestTagProjection</c>.  All other arguments are forwarded
/// unchanged to the main constructor.
/// </summary>
public MLEDependencyGrammar(ITreebankLangParserParams tlpParams, bool directional, bool distance, bool coarseDistance, bool basicCategoryTagsInDependencyGrammar, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : this(basicCategoryTagsInDependencyGrammar ? new BasicCategoryTagProjection(tlpParams.TreebankLanguagePack()) : new TestTagProjection(), tlpParams, directional, distance, coarseDistance, op, wordIndex, tagIndex)
{
}
/// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
/// <remarks>
/// Lets you test out the TreeBinarizer on the command line.
/// This main method doesn't yet handle as many flags as one would like.
/// But it does have:
/// <ul>
/// <li> -tlp TreebankLanguagePack
/// <li>-tlpp TreebankLangParserParams
/// <li>-insideFactor
/// <li>-markovOrder
/// </ul>
/// </remarks>
/// <param name="args">
/// Command line arguments: flags as above, as above followed by
/// treebankPath
/// </param>
public static void Main(string[] args)
{
    ITreebankLangParserParams tlpp = null;
    // TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    // TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
    // Looks like it must build CategoryWordTagFactory!!
    ITreeReaderFactory trf = null;
    string fileExt = "mrg";
    IHeadFinder hf = new ModCollinsHeadFinder();
    ITreebankLanguagePack tlp = new PennTreebankLanguagePack();
    bool insideFactor = false;
    // These defaults are passed positionally to the TreeBinarizer constructor
    // (markovFactor, markovOrder, two compact-grammar levels, hSelCut,
    // markFinalStates); only -markovOrder is currently settable from the
    // command line.
    bool mf = false;
    int mo = 1;
    bool uwl = false;
    bool uat = false;
    double sst = 20.0;
    bool mfs = false;
    bool simpleLabels = false;
    bool noRebinarization = false;
    int i = 0;
    // Consume leading '-' flags; the first non-flag argument is the treebank path.
    while (i < args.Length && args[i].StartsWith("-"))
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlp") && i + 1 < args.Length)
        {
            try
            {
                tlp = (ITreebankLanguagePack)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + args[i + 1]);
                // Fix: System.Exception has no (Exception) constructor; wrap
                // with a message and preserve the cause as InnerException.
                throw new Exception("Couldn't instantiate: " + args[i + 1], e);
            }
            i++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlpp") && i + 1 < args.Length)
        {
            try
            {
                tlpp = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + args[i + 1]);
                // Same fix as above: keep the original exception as the cause.
                throw new Exception("Couldn't instantiate: " + args[i + 1], e);
            }
            i++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-insideFactor"))
        {
            insideFactor = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-markovOrder") && i + 1 < args.Length)
        {
            i++;
            mo = System.Convert.ToInt32(args[i]);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-simpleLabels"))
        {
            simpleLabels = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noRebinarization"))
        {
            noRebinarization = true;
        }
        else
        {
            log.Info("Unknown option:" + args[i]);
        }
        i++;
    }
    if (i >= args.Length)
    {
        log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
        System.Environment.Exit(0);
    }
    Treebank treebank;
    if (tlpp != null)
    {
        // A tlpp, when supplied, provides the treebank, language pack,
        // file extension, and head finder all at once.
        treebank = tlpp.MemoryTreebank();
        tlp = tlpp.TreebankLanguagePack();
        fileExt = tlp.TreebankFileExtension();
        hf = tlpp.HeadFinder();
    }
    else
    {
        treebank = new DiskTreebank(trf);
    }
    treebank.LoadPath(args[i], fileExt, true);
    ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);
    // Print each original tree next to its binarized form.
    foreach (Tree t in treebank)
    {
        Tree newT = tt.TransformTree(t);
        System.Console.Out.WriteLine("Original tree:");
        t.PennPrint();
        System.Console.Out.WriteLine("Binarized tree:");
        newT.PennPrint();
        System.Console.Out.WriteLine();
    }
}
/// <summary>Execute with no arguments for usage.</summary>
/// <remarks>
/// Compares a guess treebank against a gold treebank with the leaf-ancestor
/// metric.  Tree pairs are skipped (and counted) when the gold yield exceeds
/// <c>MaxGoldYield</c> or the two yields differ in length, mirroring the
/// reference evalb behavior described in the comment below.
/// </remarks>
public static void Main(string[] args)
{
    if (!ValidateCommandLine(args))
    {
        log.Info(Usage);
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = Language.@params;
    PrintWriter pwOut = tlpp.Pw();
    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());
    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());
    Edu.Stanford.Nlp.Parser.Metrics.LeafAncestorEval metric = new Edu.Stanford.Nlp.Parser.Metrics.LeafAncestorEval("LeafAncestor");
    ITreeTransformer tc = tlpp.Collinizer();
    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    IEnumerator<Tree> guessItr = guessTreebank.GetEnumerator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;
    // Walk both treebanks in lock step, one tree pair per iteration.
    while (guessItr.MoveNext() && goldItr.MoveNext())
    {
        Tree guessTree = guessItr.Current;
        IList<ILabel> guessYield = guessTree.Yield();
        guessLineId++;
        Tree goldTree = goldItr.Current;
        IList<ILabel> goldYield = goldTree.Yield();
        goldLineId++;
        // Check that we should evaluate this tree
        if (goldYield.Count > MaxGoldYield)
        {
            skippedGuessTrees++;
            continue;
        }
        // Only trees with equal yields can be evaluated
        if (goldYield.Count != guessYield.Count)
        {
            pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }
        // Apply the collinizer transform to both trees before scoring
        // (presumably the standard evalb-style normalization — confirm
        // against the language pack's Collinizer implementation).
        Tree evalGuess = tc.TransformTree(guessTree);
        Tree evalGold = tc.TransformTree(goldTree);
        metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null));
    }
    // If either iterator still has items, the two files differed in length.
    if (guessItr.MoveNext() || goldItr.MoveNext())
    {
        System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    }
    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees);
    }
    metric.Display(true, pwOut);
    pwOut.Close();
}
// Builds the tuning set: applies the language pack's in-place tree
// transformation to every non-leaf node of each dev tree, then converts the
// transformed trees into lexicon events.
private static IList<FactoredLexiconEvent> GetTuningSet(Treebank devTreebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon, ITreebankLangParserParams tlpp)
{
    IList<Tree> transformedTrees = new List<Tree>(3000);
    foreach (Tree devTree in devTreebank)
    {
        foreach (Tree node in devTree)
        {
            if (node.IsLeaf())
            {
                continue;
            }
            tlpp.TransformTree(node, devTree);
        }
        transformedTrees.Add(devTree);
    }
    return TreebankToLexiconEvents(transformedTrees, lexicon);
}
/// <summary>
/// Convenience constructor that uses the language pack's head finder for both
/// annotation and binarization, delegating everything else unchanged to the
/// main constructor.
/// </summary>
public TreeAnnotatorAndBinarizer(ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
    : this(tlpParams.HeadFinder(), tlpParams.HeadFinder(), tlpParams, forceCNF, insideFactor, doSubcategorization, op)
{
}
/// <summary>
/// Trains a <c>FactoredLexicon</c> on a training treebank and reports tagging
/// accuracy (plus a per-gold-tag error breakdown) on a dev treebank.
/// </summary>
/// <param name="args">language, comma-separated morphological feature list,
/// training treebank path, and dev treebank path.</param>
public static void Main(string[] args)
{
    if (args.Length != 4)
    {
        System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
        System.Environment.Exit(-1);
    }
    // Command line options
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    Treebank trainTreebank = tlpp.DiskTreebank();
    trainTreebank.LoadPath(args[2]);
    Treebank devTreebank = tlpp.DiskTreebank();
    devTreebank.LoadPath(args[3]);
    MorphoFeatureSpecification morphoSpec;
    Options options = GetOptions(language);
    if (language.Equals(Language.Arabic))
    {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        string[] languageOptions = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(languageOptions, 0);
    }
    else
    {
        if (language.Equals(Language.French))
        {
            morphoSpec = new FrenchMorphoFeatureSpecification();
            string[] languageOptions = new string[] { "-frenchFactored" };
            tlpp.SetOptionFlag(languageOptions, 0);
        }
        else
        {
            // Only Arabic and French morphology are supported here.
            throw new NotSupportedException();
        }
    }
    string featureList = args[1];
    string[] features = featureList.Trim().Split(",");
    foreach (string feature in features)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.WriteLine("Features: " + args[1]);
    // Create word and tag indices
    // Save trees in a collection since the interface requires that....
    System.Console.Out.Write("Loading training trees...");
    IList<Tree> trainTrees = new List<Tree>(19000);
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    foreach (Tree tree in trainTreebank)
    {
        // Apply the language pack's transformation to each non-leaf node
        // in place before training.
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsLeaf())
            {
                tlpp.TransformTree(subTree, tree);
            }
        }
        trainTrees.Add(tree);
    }
    System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
    // Setup and train the lexicon.
    System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.InitializeTraining(trainTrees.Count);
    lexicon.Train(trainTrees, null);
    lexicon.FinishTraining();
    System.Console.Out.WriteLine("Done!");
    trainTrees = null;  // release the training trees for garbage collection
    // Load the tuning set
    System.Console.Out.Write("Loading tuning set...");
    IList<FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
    System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    ICounter<string> errors = new ClassicCounter<string>();
    foreach (FactoredLexiconEvent @event in tuningSet)
    {
        IEnumerator<IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
        ICounter<int> logScores = new ClassicCounter<int>();
        bool noRules = true;
        int goldTagId = -1;
        // Score every candidate tagging produced by the lexicon for this event.
        while (itr.MoveNext())
        {
            noRules = false;
            IntTaggedWord iTW = itr.Current;
            if (iTW.Tag() == @event.TagId())
            {
                log.Info("GOLD-");
                goldTagId = iTW.Tag();
            }
            float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
            logScores.IncrementCount(iTW.Tag(), tagScore);
        }
        if (noRules)
        {
            System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
        }
        else
        {
            // Score the tagging
            int hypTagId = Counters.Argmax(logScores);
            if (hypTagId == goldTagId)
            {
                ++nCorrect;
            }
            else
            {
                // Record the gold tag ("UNSEEN" when the gold tag never
                // appeared among the candidates) for the error breakdown.
                string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                errors.IncrementCount(goldTag);
            }
        }
        log.Info();
    }
    // Output accuracy
    double acc = (double)nCorrect / (double)tuningSet.Count;
    System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.Info("% of errors by type:");
    IList<string> biggestKeys = new List<string>(errors.KeySet());
    biggestKeys.Sort(Counters.ToComparator(errors, false, true));
    Counters.Normalize(errors);
    foreach (string key in biggestKeys)
    {
        System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
    }
}
// private static String stripTag(String tag) {
//   if (tag.startsWith("DT")) {
//     String newTag = tag.substring(2, tag.length());
//     return newTag.length() > 0 ? newTag : tag;
//   }
//   return tag;
// }
/// <summary>
/// Prints lexicon statistics (word/tag/lemma/morphological-feature counts and
/// cross-tabulations) for an Arabic or French treebank.
/// </summary>
/// <param name="args">language, treebank filename, and a comma-separated list
/// of morphological features to activate.</param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    if (language.Equals(Language.Arabic))
    {
        string[] options = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    else
    {
        // Every non-Arabic language is treated as French here.
        string[] options = new string[] { "-frenchFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    string[] features = args[2].Trim().Split(",");
    foreach (string feature in features)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }
    // Counters
    ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
    ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
    // Counter<String> signatureTagCounter = new ClassicCounter<String>();
    ICounter<string> morphCounter = new ClassicCounter<string>(500);
    ICounter<string> wordCounter = new ClassicCounter<string>(30000);
    ICounter<string> tagCounter = new ClassicCounter<string>(300);
    ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
    ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
    ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
    ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
    ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
    IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
    TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
    TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);
    int numTrees = 0;
    foreach (Tree tree in tb)
    {
        // Apply the language pack's tree transformations before counting.
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsLeaf())
            {
                tlpp.TransformTree(subTree, tree);
            }
        }
        IList<ILabel> pretermList = tree.PreTerminalYield();
        IList<ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();
            // Note: if there is no lemma, then we use the surface form.
            Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();
            // WSGDEBUG
            if (tag.Contains("MW"))
            {
                lemma += "-MWE";
            }
            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);
            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
            // Fix: IDictionary exposes ContainsKey, not Contains(key).
            if (wordLemmaMap.ContainsKey(word))
            {
                wordLemmaMap[word].Add(lemma);
            }
            else
            {
                ICollection<string> lemmas = Generics.NewHashSet(1);
                // Fix: record the first lemma observed for this word; the
                // original left the new set empty, silently dropping every
                // word's first lemma.
                lemmas.Add(lemma);
                wordLemmaMap[word] = lemmas;
            }
            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }
    // Barf...
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    // NOTE(review): this line prints morphCounter under the "#richtag+lemma"
    // label — possibly mislabeled; confirm intent before relabeling.
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
    // Extra
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap)
    {
        string word = wordLemmas.Key;
        ICollection<string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0)
        {
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1)
        {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }
        // Exactly one lemma remains; fetch it through the enumerator.
        // Fix: Current is undefined before the first MoveNext(); the original
        // read GetEnumerator().Current without advancing, which never yields
        // the element.
        IEnumerator<string> lemmaItr = lemmas.GetEnumerator();
        lemmaItr.MoveNext();
        string lemma = lemmaItr.Current;
        ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1)
        {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags)
            {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }
    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");
    IList<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort();
    foreach (string tag_1 in tags)
    {
        System.Console.Out.WriteLine(tag_1);
        ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags)
        {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}
/// <summary>
/// Builds the annotator, binarizer, and optional post-splitter from the
/// supplied head finders, language pack parameters, and training options.
/// </summary>
public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
{
    // Cache the pieces consulted repeatedly below.
    this.trainOptions = op.trainOptions;
    this.tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
    this.tlp = tlpParams.TreebankLanguagePack();
    this.forceCNF = forceCNF;
    // With subcategorization a full TreeAnnotator is used; otherwise the
    // TreeNullAnnotator (presumably a pass-through — see its definition).
    if (doSubcategorization)
    {
        annotator = new TreeAnnotator(annotationHF, tlpParams, op);
    }
    else
    {
        annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
    }
    binarizer = new TreeBinarizer(binarizationHF, tlpParams.TreebankLanguagePack(), insideFactor, trainOptions.markovFactor, trainOptions.markovOrder, trainOptions.CompactGrammar() > 0, trainOptions.CompactGrammar() > 1, trainOptions.HselCut, trainOptions.markFinalStates, trainOptions.simpleBinarizedLabels, trainOptions.noRebinarization);
    // Optional components are null unless enabled in the training options.
    postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;
    annotatedRuleCounts = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter<Tree>() : null;
    annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter<string>() : null;
}