/// <summary>
/// Command-line driver: trains a ChineseMaxentLexicon on one selection of treebank files
/// and prints its tagging accuracy on another selection.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: treebank path, loaded for both training and testing;
/// <c>args[1]</c>: number-ranges expression selecting the training files;
/// <c>args[2]</c>: number-ranges expression selecting the test files;
/// <c>args[3]</c> (optional): integer feature level; <c>DefaultFeatureLevel</c> is used when absent.
/// </param>
public static void Main(string[] args)
{
    // The first three arguments are indexed unconditionally below; report usage
    // instead of failing with an IndexOutOfRangeException.
    if (args == null || args.Length < 3)
    {
        log.Info("usage: ChineseMaxentLexicon treebankPath trainRanges testRanges [featureLevel]");
        return;
    }

    ITreebankLangParserParams tlpParams = new ChineseTreebankParserParams();
    ITreebankLanguagePack ctlp = tlpParams.TreebankLanguagePack();
    Options op = new Options(tlpParams);
    TreeAnnotator ta = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);

    log.Info("Reading Trees...");
    IFileFilter trainFilter = new NumberRangesFileFilter(args[1], true);
    Treebank trainTreebank = tlpParams.MemoryTreebank();
    trainTreebank.LoadPath(args[0], trainFilter);

    log.Info("Annotating trees...");
    ICollection <Tree> trainTrees = new List <Tree>();
    foreach (Tree tree in trainTreebank)
    {
        trainTrees.Add(ta.TransformTree(tree));
    }
    // Drop the reference to the raw treebank; only the annotated trees are needed from here on.
    trainTreebank = null;

    log.Info("Training lexicon...");
    IIndex <string> wordIndex = new HashIndex <string>();
    IIndex <string> tagIndex = new HashIndex <string>();
    int featureLevel = DefaultFeatureLevel;
    if (args.Length > 3)
    {
        featureLevel = System.Convert.ToInt32(args[3]);
    }
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
    lex.InitializeTraining(trainTrees.Count);
    lex.Train(trainTrees);
    lex.FinishTraining();

    log.Info("Testing");
    IFileFilter testFilter = new NumberRangesFileFilter(args[2], true);
    Treebank testTreebank = tlpParams.MemoryTreebank();
    testTreebank.LoadPath(args[0], testFilter);
    // Flatten every test tree into one list of tagged words.
    IList <TaggedWord> testWords = new List <TaggedWord>();
    foreach (Tree t in testTreebank)
    {
        foreach (TaggedWord tw in t.TaggedYield())
        {
            testWords.Add(tw);
        }
    }

    // Index 0 is printed as the total and index 1 as the number correct.
    int[] totalAndCorrect = lex.TestOnTreebank(testWords);
    log.Info("done.");
    System.Console.Out.WriteLine(totalAndCorrect[1] + " correct out of " + totalAndCorrect[0] + " -- ACC: " + ((double)totalAndCorrect[1]) / totalAndCorrect[0]);
}
/// <summary>
/// Reads the sub-arguments that follow a treebank flag and returns them as a
/// (path, file filter, weight) triple.
/// </summary>
/// <remarks>
/// Between one and three sub-arguments are accepted (as counted by <c>NumSubArgs</c>):
/// the path, then optionally a file selection, then optionally a weight. The last
/// sub-argument is taken as the weight when it matches <c>DoublePattern</c>. A file
/// selection is either two integers (low and high) or a single ranges expression.
/// </remarks>
/// <param name="args">The full command-line argument array.</param>
/// <param name="argIndex">Index of the flag itself within <paramref name="args"/>.</param>
/// <param name="flag">Flag name, used only in the error message.</param>
/// <returns>
/// The path, the filter (null when no file selection was given) and the weight
/// (1.0 when none was given).
/// </returns>
/// <exception cref="System.ArgumentException">
/// The flag is followed by no sub-arguments or by more than three.
/// </exception>
public static Triple <string, IFileFilter, double> GetWeightedTreebankDescription(string[] args, int argIndex, string flag)
{
    int numSubArgs = NumSubArgs(args, argIndex);
    if (numSubArgs <= 0 || numSubArgs >= 4)
    {
        throw new ArgumentException("Bad arguments after " + flag);
    }

    IFileFilter filter = null;
    double weight = 1.0;

    // Skip the flag, take the path; argIndex now points at the first remaining sub-argument.
    argIndex++;
    string path = args[argIndex++];

    // args[argIndex + numSubArgs - 2] is the last sub-argument of this flag.
    if (numSubArgs > 1 && DoublePattern.Matcher(args[argIndex + numSubArgs - 2]).Matches())
    {
        weight = double.Parse(args[argIndex + numSubArgs - 2]);
        // The weight is consumed; what is left (if anything) is the file selection.
        numSubArgs--;
    }

    if (numSubArgs == 2)
    {
        filter = new NumberRangesFileFilter(args[argIndex], true);
    }
    else if (numSubArgs == 3)
    {
        // Convert.ToInt32 raises FormatException/OverflowException, which the former
        // catch (NumberFormatException) did not match, so the ranges fallback could
        // never run. TryParse makes the fallback reachable without exceptions.
        int low;
        int high;
        if (int.TryParse(args[argIndex], out low) && int.TryParse(args[argIndex + 1], out high))
        {
            filter = new NumberRangeFileFilter(low, high, true);
        }
        else
        {
            // maybe it's a ranges expression?
            filter = new NumberRangesFileFilter(args[argIndex], true);
        }
    }

    return(Triple.MakeTriple(path, filter, weight));
}
/// <summary>
/// Creates a record by copying every supplied setting into the field of the same name.
/// No validation or defaulting is performed here.
/// </summary>
// NOTE(review): the original body opened with three comments ("represents a tokenized
// file separated by text", "represents a tsv file such as a conll file", "represents a
// file in PTB format"). They do not describe this constructor and presumably belong to
// the Format enum members - confirm and move them there.
private TaggedFileRecord(string file, TaggedFileRecord.Format format, string encoding, string tagSeparator, ITreeTransformer treeTransformer, TreeNormalizer treeNormalizer, ITreeReaderFactory trf, NumberRangesFileFilter treeRange, IPredicate <Tree> treeFilter, int wordColumn, int tagColumn)
{
    // Where the data lives and how its text is to be read.
    this.file = file;
    this.encoding = encoding;
    this.format = format;
    this.tagSeparator = tagSeparator;

    // Tree-related settings.
    this.trf = trf;
    this.treeNormalizer = treeNormalizer;
    this.treeTransformer = treeTransformer;
    this.treeFilter = treeFilter;
    this.treeRange = treeRange;

    // Column settings.
    this.tagColumn = tagColumn;
    this.wordColumn = wordColumn;
}
/// <summary>
/// Command-line driver for the character-based Chinese lexicon: can print treebank
/// statistics, train or load a parser and/or lexicon, and segment/parse a test treebank
/// with optional bracket evaluation.
/// </summary>
/// <remarks>
/// Flags that take values: <c>-parser</c> (up to 3), <c>-lex</c> (up to 3), <c>-test</c> (2),
/// <c>-stats</c> (2), <c>-out</c>, <c>-lengthPenalty</c>, <c>-penaltyType</c>, <c>-maxLength</c> (1 each).
/// Switches checked by presence only: <c>-eval</c>, <c>-annotate</c>, <c>-norm</c>, <c>-combo</c>, <c>-rad</c>.
/// When <c>-stats</c> is given the process exits after printing statistics.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    // Number of values each flag consumes. (The generated int.Parse(3) calls did not
    // compile: int.Parse takes a string.)
    IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();
    flagsToNumArgs["-parser"] = 3;
    flagsToNumArgs["-lex"] = 3;
    flagsToNumArgs["-test"] = 2;
    flagsToNumArgs["-out"] = 1;
    flagsToNumArgs["-lengthPenalty"] = 1;
    flagsToNumArgs["-penaltyType"] = 1;
    flagsToNumArgs["-maxLength"] = 1;
    flagsToNumArgs["-stats"] = 2;
    IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
    bool eval = argMap.Contains("-eval");

    PrintWriter pw = null;
    if (argMap.Contains("-out"))
    {
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
    }

    log.Info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    for (int i = 0; i < args.Length; i++)
    {
        ctpp.SetOptionFlag(args, i);
        log.Info(" " + args[i]);
    }
    log.Info();
    Options op = new Options(ctpp);

    // ---- -stats: print treebank statistics and exit ----
    if (argMap.Contains("-stats"))
    {
        string[] statArgs = (argMap["-stats"]);
        MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
        log.Info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.Contains("-annotate"))
        {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
            foreach (Tree tree in rawTrainTreebank)
            {
                trainTreebank.Add(annotator.TransformTree(tree));
            }
            log.Info("Done annotating trees.");
        }
        else
        {
            trainTreebank = rawTrainTreebank;
        }
        PrintStats(trainTreebank, pw);
        // Environment.Exit skips any later cleanup, so release the output file first.
        if (pw != null)
        {
            pw.Close();
        }
        System.Environment.Exit(0);
    }

    // Sentences longer than this are skipped in the -test loop below.
    int maxLength = 1000000;
    if (argMap.Contains("-norm"))
    {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.Contains("-maxLength"))
    {
        maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
    }
    // The parser's own limit is fixed here and is independent of the -maxLength flag.
    op.testOptions.maxLength = 120;

    bool combo = argMap.Contains("-combo");
    if (combo)
    {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }

    LexicalizedParser lp = null;
    ILexicon lex = null;

    // ---- -parser: train (path + ranges [+ output file]) or load (single file) ----
    if (argMap.Contains("-parser"))
    {
        string[] parserArgs = (argMap["-parser"]);
        if (parserArgs.Length > 1)
        {
            IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.Length == 3)
            {
                string filename = parserArgs[2];
                log.Info("Writing parser in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                try
                {
                    @out.WriteObject(lp);
                }
                finally
                {
                    // Close even when serialization fails.
                    @out.Close();
                }
                log.Info("done.");
            }
        }
        else
        {
            string parserFile = parserArgs[0];
            lp = LexicalizedParser.LoadModel(parserFile, op);
        }
        // From here on use the lexicon, options and params that belong to the parser.
        lex = lp.GetLexicon();
        op = lp.GetOp();
        ctpp = (ChineseTreebankParserParams)op.tlpParams;
    }

    if (argMap.Contains("-rad"))
    {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.Contains("-lengthPenalty"))
    {
        ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
    }
    if (argMap.Contains("-penaltyType"))
    {
        ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
    }

    // ---- -lex: train (path + ranges [+ output file]) or load a serialized lexicon ----
    if (argMap.Contains("-lex"))
    {
        string[] lexArgs = (argMap["-lex"]);
        if (lexArgs.Length > 1)
        {
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex = new HashIndex <string>();
            lex = ctpp.Lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
            IFileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
            log.Info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.Contains("-annotate"))
            {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                // A foreach iteration variable cannot be reassigned in C#, so the
                // transformed tree is added directly.
                foreach (Tree rawTree in rawTrainTreebank)
                {
                    trainTreebank.Add(annotator.TransformTree(rawTree));
                }
                log.Info("Done annotating trees.");
            }
            else
            {
                trainTreebank = rawTrainTreebank;
            }
            lex.InitializeTraining(trainTreebank.Count);
            lex.Train(trainTreebank);
            lex.FinishTraining();
            log.Info("Done training lexicon.");
            if (lexArgs.Length == 3)
            {
                string filename = lexArgs[2];
                log.Info("Writing lexicon in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                try
                {
                    @out.WriteObject(lex);
                }
                finally
                {
                    @out.Close();
                }
                log.Info("done.");
            }
        }
        else
        {
            string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.Info("Reading Lexicon from file " + lexFile);
            ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
            try
            {
                lex = (ILexicon)@in.ReadObject();
            }
            catch (TypeLoadException)
            {
                throw new Exception("Bad serialized file: " + lexFile);
            }
            finally
            {
                // Previously the stream stayed open when ReadObject failed.
                @in.Close();
            }
        }
    }

    // ---- -test: segment and/or parse a test treebank, optionally evaluating ----
    if (argMap.Contains("-test"))
    {
        bool segmentWords = ctpp.segment;
        bool parse = lp != null;
        System.Diagnostics.Debug.Assert((parse || segmentWords));
        IWordSegmenter seg = null;
        if (segmentWords)
        {
            seg = (IWordSegmenter)lex;
        }
        string[] testArgs = (argMap["-test"]);
        MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.LoadPath(new File(testArgs[0]), testFilt);
        ITreeTransformer subcategoryStripper = op.tlpParams.SubcategoryStripper();
        ITreeTransformer collinizer = ctpp.Collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");

        // Decide which bracket types are evaluated for this configuration.
        IList <string> evalTypes = new List <string>(3);
        bool goodPOS = false;
        if (segmentWords)
        {
            evalTypes.Add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse)
            {
                evalTypes.Add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse)
        {
            evalTypes.Add(WordCatConstituent.tagType);
            evalTypes.Add(WordCatConstituent.catType);
            if (combo)
            {
                evalTypes.Add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);

        log.Info("Testing...");
        foreach (Tree goldTop in testTreebank)
        {
            Tree gold = goldTop.FirstChild();
            IList <IHasWord> goldSentence = gold.YieldHasWord();
            if (goldSentence.Count > maxLength)
            {
                log.Info("Skipping sentence; too long: " + goldSentence.Count);
                continue;
            }
            log.Info("Processing sentence; length: " + goldSentence.Count);

            IList <IHasWord> s;
            if (segmentWords)
            {
                // Concatenate the gold words into one unsegmented string and re-segment it.
                StringBuilder goldCharBuf = new StringBuilder();
                foreach (IHasWord aGoldSentence in goldSentence)
                {
                    StringLabel word = (StringLabel)aGoldSentence;
                    goldCharBuf.Append(word.Value());
                }
                string goldChars = goldCharBuf.ToString();
                s = seg.Segment(goldChars);
            }
            else
            {
                s = goldSentence;
            }

            Tree tree;
            if (parse)
            {
                tree = lp.ParseTree(s);
                if (tree == null)
                {
                    throw new Exception("PARSER RETURNED NULL!!!");
                }
            }
            else
            {
                tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                tree = subcategoryStripper.TransformTree(tree);
            }

            if (pw != null)
            {
                if (parse)
                {
                    tree.PennPrint(pw);
                }
                else
                {
                    // Print the words separated by single spaces. The generated loop read
                    // IEnumerator.Current before the first MoveNext, which is invalid.
                    bool firstWord = true;
                    foreach (IHasWord token in s)
                    {
                        if (!firstWord)
                        {
                            pw.Print(" ");
                        }
                        pw.Print(((Word)token).Word());
                        firstWord = false;
                    }
                }
                pw.Println();
            }

            if (eval)
            {
                ICollection ourBrackets;
                ICollection goldBrackets;
                ourBrackets = proc.AllBrackets(tree);
                goldBrackets = proc.AllBrackets(gold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                }
                basicEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nScores:");
                basicEval.DisplayLast();

                // Same evaluation again on the collinized trees.
                Tree collinsTree = collinizer.TransformTree(tree);
                Tree collinsGold = collinizer.TransformTree(gold);
                ourBrackets = proc.AllBrackets(collinsTree);
                goldBrackets = proc.AllBrackets(collinsGold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nCollinized scores:");
                collinsEval.DisplayLast();
                System.Console.Out.WriteLine();
            }
        }

        if (eval)
        {
            basicEval.Display();
            System.Console.Out.WriteLine();
            collinsEval.Display();
        }
    }

    // Release the -out file; it was previously never closed.
    if (pw != null)
    {
        pw.Close();
    }
}
/// <summary>
/// This method lets you train and test a segmenter relative to a Treebank.
/// </summary>
/// <remarks>
/// <i>Implementation note:</i> This method is largely cloned from
/// LexicalizedParser's main method. Should we try to have it be able
/// to train segmenters to stop things going out of sync?
/// <p>
/// Leading options handled here: <c>-train path [ranges | low high]</c>,
/// <c>-encoding enc</c>, <c>-loadFromSerializedFile file</c>,
/// <c>-saveToSerializedFile file</c>, <c>-saveToTextFile file</c>,
/// <c>-treebank [path] [ranges | low high]</c>. Any other leading option is offered to
/// <c>op.tlpParams.SetOptionFlag</c> and ignored with a log message if not consumed.
/// Remaining arguments are positional.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
public static void Main(string[] args)
{
    bool train = false;
    bool saveToSerializedFile = false;
    bool saveToTextFile = false;
    string serializedInputFileOrUrl = null;
    // Never assigned: loading from a text file is disabled (see the load branch below).
    string textInputFileOrUrl = null;
    string serializedOutputFileOrUrl = null;
    string textOutputFileOrUrl = null;
    string treebankPath = null;
    Treebank testTreebank = null;
    string testPath = null;
    IFileFilter testFilter = null;
    IFileFilter trainFilter = null;
    string encoding = null;
    // Unused in this method; construction kept because its side effects, if any, are not visible here.
    IFunction <IList <IHasWord>, IList <IHasWord> > escaper = new ChineseEscaper();
    int argIndex = 0;

    if (args.Length < 1)
    {
        log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
        return;
    }

    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();

    // ---- leading option arguments ----
    while (argIndex < args.Length && args[argIndex][0] == '-')
    {
        string flag = args[argIndex];
        if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-train"))
        {
            train = true;
            saveToSerializedFile = true;
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            // Only the path is mandatory (matches the error text); the former "> 1"
            // test rejected "-train path" even though the path was supplied.
            if (numSubArgs < 1)
            {
                throw new Exception("Error: -train option must have treebankPath as first argument.");
            }
            treebankPath = args[argIndex++];
            if (numSubArgs == 2)
            {
                trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else if (numSubArgs >= 3)
            {
                trainFilter = ParseRangeFilter(args, ref argIndex);
            }
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-encoding"))
        {
            // sets encoding for TreebankLangParserParams
            encoding = args[argIndex + 1];
            op.tlpParams.SetInputEncoding(encoding);
            op.tlpParams.SetOutputEncoding(encoding);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-loadFromSerializedFile"))
        {
            // the next argument must be the path to the serialized file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-saveToSerializedFile"))
        {
            saveToSerializedFile = true;
            serializedOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-saveToTextFile"))
        {
            // save the segmenter data to a declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-treebank"))
        {
            // the next arguments are the (optional) treebank path and the range for testing
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs == 1)
            {
                // Ranges only; the path falls back to the training path later on.
                testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else if (numSubArgs > 1)
            {
                testPath = args[argIndex++];
                if (numSubArgs == 2)
                {
                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                }
                else
                {
                    testFilter = ParseRangeFilter(args, ref argIndex);
                }
            }
        }
        else
        {
            int j = op.tlpParams.SetOptionFlag(args, argIndex);
            if (j == argIndex)
            {
                log.Info("Unknown option ignored: " + args[argIndex]);
                j++;
            }
            argIndex = j;
        }
    }

    // All other arguments are order dependent and are processed in order below.
    ITreebankLangParserParams tlpParams = op.tlpParams;
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;

    if (!train && op.testOptions.verbose)
    {
        // new DateTime() is 0001-01-01, not the current time.
        System.Console.Out.WriteLine("Currently " + DateTime.Now);
        PrintArgs(args, System.Console.Out);
    }

    if (train)
    {
        PrintArgs(args, System.Console.Out);
        if (treebankPath == null)
        {
            // the next arg must be the treebank path, since it wasn't given earlier
            treebankPath = args[argIndex];
            argIndex++;
            if (args.Length > argIndex + 1)
            {
                // the next two args might be the range
                trainFilter = ParseRangeFilter(args, ref argIndex);
            }
        }
        Treebank trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
        IIndex <string> wordIndex = new HashIndex <string>();
        IIndex <string> tagIndex = new HashIndex <string>();
        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    }
    else if (textInputFileOrUrl == null)
    {
        // Loading from a text grammar file is not implemented ("fix later -pichuan"),
        // so a serialized segmenter is the only load path.
        if (serializedInputFileOrUrl == null)
        {
            // the next argument must be the path to the serialized segmenter
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        try
        {
            cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
        }
        catch (ArgumentException)
        {
            log.Info("Error loading segmenter, exiting...");
            System.Environment.Exit(0);
        }
    }

    // The following has to go after reading the segmenter to make sure
    // op and tlpParams are the same for train and test.
    // Result unused here; call kept because its side effects, if any, are not visible.
    TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

    if (testFilter != null)
    {
        if (testPath == null)
        {
            if (treebankPath == null)
            {
                throw new Exception("No test treebank path specified...");
            }
            log.Info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
            testPath = treebankPath;
        }
        testTreebank = tlpParams.TestMemoryTreebank();
        testTreebank.LoadPath(testPath, testFilter);
    }

    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));

    // At this point we should be sure that op.tlpParams is set appropriately
    // (from command line, or from grammar file), and will never change again. -- Roger
    if (op.testOptions.verbose)
    {
        log.Info("Lexicon is " + cs.GetType().FullName);
    }

    // Results unused here; calls kept because their side effects, if any, are not visible.
    PrintWriter pwOut = tlpParams.Pw();
    PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

    // ---- what to do with the segmenter we have made ----
    if (saveToTextFile)
    {
        if (textOutputFileOrUrl != null)
        {
            SaveSegmenterDataToText(cs, textOutputFileOrUrl);
        }
        else
        {
            log.Info("Usage: must specify a text segmenter data output path");
        }
    }

    if (saveToSerializedFile)
    {
        if (serializedOutputFileOrUrl == null && argIndex < args.Length)
        {
            // the next argument must be the path to serialize to
            serializedOutputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedOutputFileOrUrl != null)
        {
            SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
        }
        else if (textOutputFileOrUrl == null && testTreebank == null)
        {
            // No saving/parsing request has been specified. A space was missing between
            // the class name and "-train" in the concatenated message.
            log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + " -train trainFilesPath [start stop] serializedParserFilename");
        }
    }

    /* --------------------- Testing part!!!! ----------------------- */
    if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
    {
        if (testTreebank == null)
        {
            // the next argument is the treebank path and range for testing
            testTreebank = tlpParams.TestMemoryTreebank();
            if (args.Length < argIndex + 4)
            {
                testTreebank.LoadPath(args[argIndex + 1]);
            }
            else
            {
                int testlow = System.Convert.ToInt32(args[argIndex + 2]);
                int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
            }
        }
    }
}

/// <summary>
/// Reads a file selection starting at <paramref name="argIndex"/>: two integers give a
/// low/high NumberRangeFileFilter (consuming two arguments); otherwise the single
/// argument is treated as a ranges expression (consuming one).
/// </summary>
/// <remarks>
/// Callers guarantee that <c>args[argIndex + 1]</c> exists. Uses int.TryParse because
/// Convert.ToInt32 raises FormatException/OverflowException, which the previous
/// <c>catch (NumberFormatException)</c> fallbacks did not match.
/// </remarks>
private static IFileFilter ParseRangeFilter(string[] args, ref int argIndex)
{
    int low;
    int high;
    if (int.TryParse(args[argIndex], out low) && int.TryParse(args[argIndex + 1], out high))
    {
        argIndex += 2;
        return new NumberRangeFileFilter(low, high, true);
    }
    // maybe it's a ranges expression?
    return new NumberRangesFileFilter(args[argIndex++], true);
}
/// <summary>
/// Builds a TaggedFileRecord from a comma-separated description whose last piece is the
/// file and whose preceding pieces are <c>name=value</c> settings.
/// </summary>
/// <remarks>
/// A description without commas is treated as a plain file name in Text format, with
/// encoding and tag separator taken from <paramref name="config"/>. Setting names are
/// compared case-insensitively; in a tree-range value every ":" is replaced by ",".
/// </remarks>
/// <param name="config">Properties passed to <c>GetEncoding</c> and <c>GetTagSeparator</c> for the defaults.</param>
/// <param name="description">The record description to parse.</param>
/// <returns>The record described by <paramref name="description"/>.</returns>
/// <exception cref="System.ArgumentException">
/// A setting does not split into exactly a name and a value at "=", or its name is not recognized.
/// </exception>
public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description)
{
    string[] pieces = description.Split(",");
    int settingCount = pieces.Length - 1;
    if (settingCount == 0)
    {
        return(new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null));
    }

    // The final piece names the file; everything before it is a setting.
    string file = pieces[settingCount];

    // Defaults, overridden below by whichever settings are present.
    TaggedFileRecord.Format format = TaggedFileRecord.Format.Text;
    string encoding = GetEncoding(config);
    string tagSeparator = GetTagSeparator(config);
    ITreeTransformer treeTransformer = null;
    TreeNormalizer treeNormalizer = null;
    ITreeReaderFactory trf = null;
    NumberRangesFileFilter treeRange = null;
    IPredicate <Tree> treeFilter = null;
    // NOTE(review): an int cannot hold null in C#; these two declarations (and the null
    // column arguments in the early return above) look like leftovers of a nullable
    // integer type. Fixing them needs a matching change to the constructor's parameter
    // types, so they are left as they were - confirm the intended "unset" value.
    int wordColumn = null;
    int tagColumn = null;

    for (int k = 0; k < settingCount; k++)
    {
        string setting = pieces[k];
        string[] nameAndValue = setting.Split("=", 2);
        if (nameAndValue.Length != 2)
        {
            throw new ArgumentException("TaggedFileRecord argument " + setting + " has an unexpected number of =s");
        }
        string name = nameAndValue[0];
        string value = nameAndValue[1];

        if (Sharpen.Runtime.EqualsIgnoreCase(name, Format))
        {
            format = TaggedFileRecord.Format.ValueOf(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, Encoding))
        {
            encoding = value;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TagSeparator))
        {
            tagSeparator = value;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TreeTransformer))
        {
            treeTransformer = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TreeNormalizer))
        {
            treeNormalizer = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TreeReader))
        {
            trf = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TreeRange))
        {
            // "," already separates settings, so ranges are written with ":" and converted here.
            string range = value.ReplaceAll(":", ",");
            treeRange = new NumberRangesFileFilter(range, true);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TreeFilter))
        {
            treeFilter = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, WordColumn))
        {
            wordColumn = int.Parse(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(name, TagColumn))
        {
            tagColumn = int.Parse(value);
        }
        else
        {
            throw new ArgumentException("TaggedFileRecord argument " + name + " is unknown");
        }
    }

    return(new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn));
}
/// <summary>Lets you test out the TreeAnnotatorAndBinarizer on the command line.</summary>
/// <remarks>
/// Loads the training treebank, binarizes it, and prints each binarized tree next to the
/// original tree at the same position in the treebank.
/// </remarks>
/// <param name="args">
/// Command line arguments: All flags accepted by FactoredParser.setOptionFlag
/// and -train treebankPath [fileRanges]
/// </param>
public static void Main(string[] args)
{
    Options op = new Options();
    string treebankPath = null;
    IFileFilter trainFilter = null;
    int i = 0;
    while (i < args.Length && args[i].StartsWith("-", System.StringComparison.Ordinal))
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train"))
        {
            int numSubArgs = NumSubArgs(args, i);
            i++;
            if (numSubArgs < 1)
            {
                throw new Exception("Error: -train option must have treebankPath as first argument.");
            }
            treebankPath = args[i];
            i++;
            if (numSubArgs == 2)
            {
                trainFilter = new NumberRangesFileFilter(args[i++], true);
            }
            else if (numSubArgs >= 3)
            {
                int low = System.Convert.ToInt32(args[i]);
                int high = System.Convert.ToInt32(args[i + 1]);
                trainFilter = new NumberRangeFileFilter(low, high, true);
                i += 2;
            }
        }
        else
        {
            i = op.SetOption(args, i);
        }
    }

    // Anything left over is a positional argument, which this tool does not accept.
    if (i < args.Length)
    {
        log.Info("usage: java TreeAnnotatorAndBinarizer options*");
        log.Info(" Options are like for lexicalized parser including -train treebankPath [fileRange]");
        return;
    }

    log.Info("Annotating from treebank dir: " + treebankPath);
    Treebank trainTreebank = op.tlpParams.DiskTreebank();
    if (trainFilter == null)
    {
        trainTreebank.LoadPath(treebankPath);
    }
    else
    {
        trainTreebank.LoadPath(treebankPath, trainFilter);
    }
    Treebank binaryTrainTreebank = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank, null, null, op).First();

    // Walk the original treebank in step with the binarized one.
    using (IEnumerator <Tree> it = trainTreebank.GetEnumerator())
    {
        foreach (Tree t in binaryTrainTreebank)
        {
            System.Console.Out.WriteLine("Original tree:");
            // The enumerator must be advanced before Current is read; the generated code
            // never called MoveNext, so Current was never a valid element.
            if (it.MoveNext())
            {
                it.Current.PennPrint();
            }
            System.Console.Out.WriteLine("Binarized tree:");
            t.PennPrint();
            System.Console.Out.WriteLine();
        }
    }
}
/// <summary>for testing -- CURRENTLY BROKEN!!!</summary>
/// <remarks>
/// Trains a parser from <c>args[0]</c> restricted to the <c>args[1]</c> file ranges and
/// serializes it to "chineseCharTagPCFG.ser.gz"; if training raises ArgumentException,
/// <c>args[1]</c> is instead loaded as a model file. The <c>args[2]</c> file ranges are then
/// parsed and gold/parsed yields and trees are written to "out.chi" (GB18030).
/// The bracket evaluation itself is commented out, so the displayed scores are not
/// fed by this loop.
/// </remarks>
/// <param name="args">treebankPath trainNums testNums (exactly three arguments)</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        throw new Exception("args: treebankPath trainNums testNums");
    }
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;

    LexicalizedParser lp;
    try
    {
        IFileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
        lp = LexicalizedParser.TrainFromTreebank(args[0], trainFilt, op);
        try
        {
            string filename = "chineseCharTagPCFG.ser.gz";
            log.Info("Writing parser in serialized format to file " + filename + " ");
            System.Console.Error.Flush();
            ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
            try
            {
                @out.WriteObject(lp);
            }
            finally
            {
                // Close even when serialization fails.
                @out.Close();
            }
            log.Info("done.");
        }
        catch (IOException ioe)
        {
            // Failing to save the model is not fatal; testing continues with the in-memory parser.
            Sharpen.Runtime.PrintStackTrace(ioe);
        }
    }
    catch (ArgumentException)
    {
        lp = LexicalizedParser.LoadModel(args[1], op);
    }

    IFileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.MemoryTreebank();
    testTreebank.LoadPath(new File(args[0]), testFilt);

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
    try
    {
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);

        System.Console.Out.WriteLine("Testing...");
        foreach (Tree goldTop in testTreebank)
        {
            Tree tree;
            try
            {
                tree = lp.ParseTree(goldTop.YieldHasWord());
                if (tree == null)
                {
                    System.Console.Out.WriteLine("Failed to parse " + goldTop.YieldHasWord());
                    continue;
                }
            }
            catch (Exception e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                continue;
            }

            // A foreach iteration variable cannot be reassigned in C#, so the first
            // child gets its own local.
            Tree gold = goldTop.FirstChild();
            pw.Println(SentenceUtils.ListToString(gold.PreTerminalYield()));
            pw.Println(SentenceUtils.ListToString(gold.Yield()));
            gold.PennPrint(pw);
            pw.Println(tree.PreTerminalYield());
            pw.Println(tree.Yield());
            tree.PennPrint(pw);
            // Collection allBrackets = WordCatConstituent.allBrackets(tree);
            // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
            // eval.eval(allBrackets, goldBrackets);
            eval.DisplayLast();
        }
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine();
        eval.Display();
    }
    finally
    {
        // The output file was previously never closed.
        pw.Close();
    }
}