/// <summary>Loads a serialized ChineseLexiconAndWordSegmenter from a local file or an "http://" URL.</summary>
/// <remarks>
/// A ".gz" suffix enables gzip decompression. Returns null (rather than throwing) when the
/// stream does not look like serialized data at all, so the caller can try a text format next.
/// </remarks>
/// <param name="serializedFileOrUrl">Path to a local file, or an http URL, of the serialized segmenter</param>
/// <returns>The deserialized segmenter, or null if the stream was not a serialized object stream</returns>
/// <exception cref="System.Exception">If the serialized class is incompatible or the file is missing (not recoverable as text)</exception>
protected internal static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromSerializedFile(string serializedFileOrUrl)
{
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
    try
    {
        log.Info("Loading segmenter from serialized file " + serializedFileOrUrl + " ...");
        ObjectInputStream @in;
        InputStream @is;
        if (serializedFileOrUrl.StartsWith("http://"))
        {
            URL u = new URL(serializedFileOrUrl);
            URLConnection uc = u.OpenConnection();
            @is = uc.GetInputStream();
        }
        else
        {
            @is = new FileInputStream(serializedFileOrUrl);
        }
        if (serializedFileOrUrl.EndsWith(".gz"))
        {
            // it's faster to do the buffering _outside_ the gzipping as here
            @in = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(@is)));
        }
        else
        {
            @in = new ObjectInputStream(new BufferedInputStream(@is));
        }
        cs = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)@in.ReadObject();
        @in.Close();
        log.Info(" done.");
        return cs;
    }
    catch (InvalidClassException ice)
    {
        // For this, it's not a good idea to continue and try it as a text file!
        log.Info(); // as in middle of line from above message
        // FIX: wrap with the (message, innerException) constructor so the original
        // cause and its stack trace are preserved; Exception has no (Exception) ctor.
        throw new Exception(ice.Message, ice);
    }
    catch (FileNotFoundException fnfe)
    {
        // For this, it's not a good idea to continue and try it as a text file!
        log.Info(); // as in middle of line from above message
        throw new Exception(fnfe.Message, fnfe);
    }
    catch (StreamCorruptedException)
    {
        // Deliberately swallowed: the file is presumably a text grammar, which the
        // caller will try next. NOTE(review): the underlying stream is not closed on
        // this path; fixing that would require hoisting @is out of the try block.
    }
    catch (Exception e)
    {
        // suppress error message, on the assumption that we've really got
        // a text grammar, and that'll be tried next
        log.Info(); // as in middle of line from above message
        Sharpen.Runtime.PrintStackTrace(e);
    }
    return null;
}
/// <summary>Construct a new ChineseLexiconAndWordSegmenter.</summary>
/// <remarks>
/// Construct a new ChineseLexiconAndWordSegmenter. This loads a segmenter file that
/// was previously assembled and stored.
/// </remarks>
/// <exception cref="System.ArgumentException">If segmenter data cannot be loaded</exception>
public ChineseLexiconAndWordSegmenter(string segmenterFileOrUrl, Options op)
{
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter loaded = GetSegmenterDataFromFile(segmenterFileOrUrl, op);
    // A serialized Options object may have been read in alongside the data,
    // so prefer the loaded copy over the one passed in.
    this.op = loaded.op;
    this.chineseLexicon = loaded.chineseLexicon;
    this.wordSegmenter = loaded.wordSegmenter;
}
/// <summary>Loads segmenter data, currently only from a serialized file or URL.</summary>
/// <param name="parserFileOrUrl">Path or URL of the serialized segmenter</param>
/// <param name="op">Options; unused here, kept for a future text-format loader</param>
/// <returns>The loaded segmenter, or null if it could not be deserialized</returns>
public static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromFile(string parserFileOrUrl, Options op)
{
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter result = GetSegmenterDataFromSerializedFile(parserFileOrUrl);
    // Falling back to a text-format segmenter when result is null is not implemented:
    // result = getSegmenterDataFromTextFile(parserFileOrUrl, op);
    return result;
}
/// <summary>Serializes the given segmenter to the named file.</summary>
/// <remarks>Best-effort: an IOException is printed to the log, not propagated.</remarks>
/// <param name="cs">The segmenter to write</param>
/// <param name="filename">Destination file name</param>
internal static void SaveSegmenterDataToSerialized(Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs, string filename)
{
    try
    {
        log.Info("Writing segmenter in serialized format to file " + filename + " ");
        ObjectOutputStream oos = IOUtils.WriteStreamFromString(filename);
        oos.WriteObject(cs);
        oos.Close();
        log.Info("done.");
    }
    catch (IOException ioe)
    {
        // Report the failure but keep going; callers do not expect this to throw.
        Sharpen.Runtime.PrintStackTrace(ioe);
    }
}
/// <summary>Writes the segmenter's lexicon in text grammar format to the named file.</summary>
/// <remarks>
/// A ".gz" suffix gzips the output. IO failures are logged, not thrown.
/// Only the LEXICON section is written; the OPTIONS section is currently disabled.
/// </remarks>
/// <param name="cs">The segmenter whose lexicon data is written; may be null (then only headers are written)</param>
/// <param name="filename">Destination file name</param>
internal static void SaveSegmenterDataToText(Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs, string filename)
{
    try
    {
        // FIX: message said "Writing parser ..." though this saves a segmenter;
        // now consistent with SaveSegmenterDataToSerialized and the catch below.
        log.Info("Writing segmenter in text grammar format to file " + filename);
        OutputStream os;
        if (filename.EndsWith(".gz"))
        {
            // it's faster to do the buffering _outside_ the gzipping as here
            os = new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(filename)));
        }
        else
        {
            os = new BufferedOutputStream(new FileOutputStream(filename));
        }
        PrintWriter @out = new PrintWriter(os);
        string prefix = "BEGIN ";
        // out.println(prefix + "OPTIONS");
        // if (pd.pt != null) {
        //   pd.pt.writeData(out);
        // }
        // out.println();
        // log.info(".");
        @out.Println(prefix + "LEXICON");
        if (cs != null)
        {
            cs.WriteData(@out);
        }
        @out.Println();
        log.Info(".");
        @out.Flush();
        @out.Close();
        log.Info("done.");
    }
    catch (IOException e)
    {
        log.Info("Trouble saving segmenter data to ASCII format.");
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// </summary>
/// <remarks>
/// This method lets you train and test a segmenter relative to a
/// Treebank.
/// <p>
/// <i>Implementation note:</i> This method is largely cloned from
/// LexicalizedParser's main method. Should we try to have it be able
/// to train segmenters to stop things going out of sync?
/// </remarks>
public static void Main(string[] args)
{
    // Mode flags and file names, filled in while scanning the option arguments below.
    bool train = false;
    bool saveToSerializedFile = false;
    bool saveToTextFile = false;
    string serializedInputFileOrUrl = null;
    string textInputFileOrUrl = null;
    string serializedOutputFileOrUrl = null;
    string textOutputFileOrUrl = null;
    string treebankPath = null;
    Treebank testTreebank = null;
    // Treebank tuneTreebank = null;
    string testPath = null;
    IFileFilter testFilter = null;
    IFileFilter trainFilter = null;
    string encoding = null;
    // variables needed to process the files to be parsed
    ITokenizerFactory<Word> tokenizerFactory = null;
    // DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
    bool tokenized = false;
    // whether or not the input file has already been tokenized
    IFunction<IList<IHasWord>, IList<IHasWord>> escaper = new ChineseEscaper();
    // int tagDelimiter = -1;
    // String sentenceDelimiter = "\n";
    // boolean fromXML = false;
    int argIndex = 0;
    if (args.Length < 1)
    {
        log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
        return;
    }
    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();
    // while loop through option arguments
    while (argIndex < args.Length && args[argIndex][0] == '-')
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
        {
            train = true;
            saveToSerializedFile = true;
            int numSubArgs = NumSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs > 1)
            {
                treebankPath = args[argIndex];
                argIndex++;
            }
            else
            {
                throw new Exception("Error: -train option must have treebankPath as first argument.");
            }
            if (numSubArgs == 2)
            {
                // a single ranges expression, e.g. "1-199"
                trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
            }
            else
            {
                if (numSubArgs >= 3)
                {
                    // either "low high" numbers or a ranges expression
                    try
                    {
                        int low = System.Convert.ToInt32(args[argIndex]);
                        int high = System.Convert.ToInt32(args[argIndex + 1]);
                        trainFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    }
                    catch (NumberFormatException)
                    {
                        // maybe it's a ranges expression?
                        trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                        argIndex++;
                    }
                }
            }
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
            {
                // sets encoding for TreebankLangParserParams
                encoding = args[argIndex + 1];
                op.tlpParams.SetInputEncoding(encoding);
                op.tlpParams.SetOutputEncoding(encoding);
                argIndex += 2;
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                {
                    // load the parser from a binary serialized file
                    // the next argument must be the path to the parser file
                    serializedInputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    // doesn't make sense to load from TextFile -pichuan
                    // } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                    //   // load the parser from declarative text file
                    //   // the next argument must be the path to the parser file
                    //   textInputFileOrUrl = args[argIndex + 1];
                    //   argIndex += 2;
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                    {
                        saveToSerializedFile = true;
                        serializedOutputFileOrUrl = args[argIndex + 1];
                        argIndex += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                        {
                            // save the parser to declarative text file
                            saveToTextFile = true;
                            textOutputFileOrUrl = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                            {
                                // the next argument is the treebank path and range for testing
                                int numSubArgs = NumSubArgs(args, argIndex);
                                argIndex++;
                                if (numSubArgs == 1)
                                {
                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                }
                                else
                                {
                                    if (numSubArgs > 1)
                                    {
                                        testPath = args[argIndex++];
                                        if (numSubArgs == 2)
                                        {
                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                        }
                                        else
                                        {
                                            if (numSubArgs >= 3)
                                            {
                                                try
                                                {
                                                    int low = System.Convert.ToInt32(args[argIndex]);
                                                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                                                    testFilter = new NumberRangeFileFilter(low, high, true);
                                                    argIndex += 2;
                                                }
                                                catch (NumberFormatException)
                                                {
                                                    // maybe it's a ranges expression?
                                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                // Unrecognized here: let the language-pack params try to consume it.
                                int j = op.tlpParams.SetOptionFlag(args, argIndex);
                                if (j == argIndex)
                                {
                                    log.Info("Unknown option ignored: " + args[argIndex]);
                                    j++;
                                }
                                argIndex = j;
                            }
                        }
                    }
                }
            }
        }
    }
    // end while loop through arguments
    ITreebankLangParserParams tlpParams = op.tlpParams;
    // all other arguments are order dependent and
    // are processed in order below
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose)
    {
        System.Console.Out.WriteLine("Currently " + new DateTime());
        PrintArgs(args, System.Console.Out);
    }
    if (train)
    {
        PrintArgs(args, System.Console.Out);
        // so we train a parser using the treebank
        if (treebankPath == null)
        {
            // the next arg must be the treebank path, since it wasn't give earlier
            treebankPath = args[argIndex];
            argIndex++;
            if (args.Length > argIndex + 1)
            {
                try
                {
                    // the next two args might be the range
                    int low = System.Convert.ToInt32(args[argIndex]);
                    int high = System.Convert.ToInt32(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                }
                catch (NumberFormatException)
                {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        Treebank trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
        IIndex<string> wordIndex = new HashIndex<string>();
        IIndex<string> tagIndex = new HashIndex<string>();
        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    }
    else
    {
        if (textInputFileOrUrl != null)
        {
        }
        else
        {
            // so we load the segmenter from a text grammar file
            // XXXXX fix later -pichuan
            //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
            // so we load a serialized segmenter
            if (serializedInputFileOrUrl == null)
            {
                // the next argument must be the path to the serialized parser
                serializedInputFileOrUrl = args[argIndex];
                argIndex++;
            }
            try
            {
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
            }
            catch (ArgumentException)
            {
                log.Info("Error loading segmenter, exiting...");
                System.Environment.Exit(0);
            }
        }
    }
    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.TreePrint(tlpParams);
    if (testFilter != null)
    {
        if (testPath == null)
        {
            if (treebankPath == null)
            {
                throw new Exception("No test treebank path specified...");
            }
            else
            {
                log.Info("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
                testPath = treebankPath;
            }
        }
        testTreebank = tlpParams.TestMemoryTreebank();
        testTreebank.LoadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
    // at this point we should be sure that op.tlpParams is
    // set appropriately (from command line, or from grammar file),
    // and will never change again. We also set the tlpParams of the
    // LexicalizedParser instance to be the same object. This is
    // redundancy that we probably should take out eventually.
    //
    // -- Roger
    if (op.testOptions.verbose)
    {
        log.Info("Lexicon is " + cs.GetType().FullName);
    }
    PrintWriter pwOut = tlpParams.Pw();
    PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
    // Now what do we do with the parser we've made
    if (saveToTextFile)
    {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null)
        {
            SaveSegmenterDataToText(cs, textOutputFileOrUrl);
        }
        else
        {
            log.Info("Usage: must specify a text segmenter data output path");
        }
    }
    if (saveToSerializedFile)
    {
        if (serializedOutputFileOrUrl == null && argIndex < args.Length)
        {
            // the next argument must be the path to serialize to
            serializedOutputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedOutputFileOrUrl != null)
        {
            SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
        }
        else
        {
            if (textOutputFileOrUrl == null && testTreebank == null)
            {
                // no saving/parsing request has been specified
                log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
            }
        }
    }
    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose)
    {
    }
    // printOptions(false, op);
    if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
    {
        // test parser on treebank
        if (testTreebank == null)
        {
            // the next argument is the treebank path and range for testing
            testTreebank = tlpParams.TestMemoryTreebank();
            if (args.Length < argIndex + 4)
            {
                testTreebank.LoadPath(args[argIndex + 1]);
            }
            else
            {
                int testlow = System.Convert.ToInt32(args[argIndex + 2]);
                int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
            }
        }
    }
}
/// <summary>Trains a new lexicon-and-segmenter from a treebank.</summary>
/// <remarks>
/// Binarizes/annotates the training trees (per the options), optionally collecting
/// selective-split statistics in a first pass, then extracts and trains the lexicon.
/// </remarks>
/// <param name="trainTreebank">Trees to train on</param>
/// <param name="op">Options; op.tlpParams supplies the language-specific parameters</param>
/// <param name="wordIndex">Index that will receive the word vocabulary</param>
/// <param name="tagIndex">Index that will receive the tag vocabulary</param>
/// <returns>The trained segmenter/lexicon</returns>
private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    System.Console.Out.WriteLine("Currently " + new DateTime());
    // printOptions(true, op);
    Timing.StartTime();
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose)
    {
        System.Console.Out.Write("Training ");
        System.Console.Out.WriteLine(trainTreebank.TextualSummary());
    }
    System.Console.Out.Write("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    // initialized below
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    else
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc)
    {
        collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
    }
    IList<Tree> binaryTrainTrees = new List<Tree>();
    // List<Tree> binaryTuneTrees = new ArrayList<Tree>();
    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent split categories: " + op.trainOptions.splitters);
        }
    }
    if (op.trainOptions.selectivePostSplit)
    {
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        Treebank annotatedTB = trainTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }
    if (op.trainOptions.hSelSplit)
    {
        // First pass over the trees only to collect binarization statistics;
        // the transformed trees themselves are discarded.
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            // FIX: C# forbids assigning to a foreach iteration variable (CS1656),
            // unlike the original Java, so transform via a local copy.
            Tree t = tree;
            if (op.trainOptions.collinsPunc)
            {
                t = collinsPuncTransformer.TransformTree(t);
            }
            t = binarizer.TransformTree(t);
        }
        binarizer.SetDoSelectiveSplit(true);
    }
    foreach (Tree tree_1 in trainTreebank)
    {
        // Same fix as above: work on a local copy of the iteration variable.
        Tree t = tree_1;
        if (op.trainOptions.collinsPunc)
        {
            t = collinsPuncTransformer.TransformTree(t);
        }
        t = binarizer.TransformTree(t);
        binaryTrainTrees.Add(t);
    }
    Timing.Tick("done.");
    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }
    System.Console.Out.Write("Extracting Lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
    clex.InitializeTraining(binaryTrainTrees.Count);
    clex.Train(binaryTrainTrees);
    clex.FinishTraining();
    Timing.Tick("done.");
    return clex;
}
/// <summary>
/// Trains a new segmenter/lexicon from the given treebank and adopts its
/// lexicon and word segmenter as this instance's components.
/// </summary>
private ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter trained = GetSegmenterDataFromTreebank(trainTreebank, op, wordIndex, tagIndex);
    this.wordSegmenter = trained.wordSegmenter;
    this.chineseLexicon = trained.chineseLexicon;
}