Пример #1
0
 /// <summary>Reads a serialized segmenter from a local file or an "http://" URL.</summary>
 /// <remarks>
 /// A name ending in ".gz" is decompressed while reading.  The underlying stream is
 /// closed on every exit path, including when deserialization fails.
 /// </remarks>
 /// <param name="serializedFileOrUrl">Path of the serialized file, or a URL starting with "http://".</param>
 /// <returns>
 /// The deserialized segmenter, or null when the data could not be read for any reason
 /// other than an invalid class or a missing file.
 /// </returns>
 /// <exception cref="System.Exception">
 /// Wraps an InvalidClassException or FileNotFoundException raised while loading.
 /// </exception>
 protected internal static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromSerializedFile(string serializedFileOrUrl)
 {
     ObjectInputStream @in = null;
     InputStream       @is = null;
     try
     {
         log.Info("Loading segmenter from serialized file " + serializedFileOrUrl + " ...");
         if (serializedFileOrUrl.StartsWith("http://"))
         {
             URL           u  = new URL(serializedFileOrUrl);
             URLConnection uc = u.OpenConnection();
             @is = uc.GetInputStream();
         }
         else
         {
             @is = new FileInputStream(serializedFileOrUrl);
         }
         if (serializedFileOrUrl.EndsWith(".gz"))
         {
             // it's faster to do the buffering _outside_ the gzipping as here
             @in = new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(@is)));
         }
         else
         {
             @in = new ObjectInputStream(new BufferedInputStream(@is));
         }
         Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)@in.ReadObject();
         log.Info(" done.");
         return cs;
     }
     catch (InvalidClassException ice)
     {
         // For this, it's not a good idea to continue and try it as a text file!
         log.Info();
         // as in middle of line from above message
         throw new Exception(ice);
     }
     catch (FileNotFoundException fnfe)
     {
         // For this, it's not a good idea to continue and try it as a text file!
         log.Info();
         // as in middle of line from above message
         throw new Exception(fnfe);
     }
     catch (StreamCorruptedException)
     {
         // suppress error message, on the assumption that we've really got
         // a text grammar, and that'll be tried next
     }
     catch (Exception e)
     {
         log.Info();
         // as in middle of line from above message
         Sharpen.Runtime.PrintStackTrace(e);
     }
     finally
     {
         // Release the stream on every path.  Closing the outermost wrapper is preferred;
         // if it was never built, fall back to the raw stream that was opened.
         try
         {
             if (@in != null)
             {
                 @in.Close();
             }
             else if (@is != null)
             {
                 @is.Close();
             }
         }
         catch (Exception)
         {
             // Best effort: a failure while closing a stream we only read from
             // must not mask the result or the exception already in flight.
         }
     }
     return null;
 }
Пример #2
0
 /// <summary>Construct a new ChineseLexiconAndWordSegmenter.</summary>
 /// <remarks>
 /// Construct a new ChineseLexiconAndWordSegmenter.  This loads a segmenter file that
 /// was previously assembled and stored, then copies its options, lexicon and word
 /// segmenter into this instance.
 /// </remarks>
 /// <param name="segmenterFileOrUrl">File path or URL of the stored segmenter.</param>
 /// <param name="op">Options passed to the loader; replaced by the options of the loaded segmenter.</param>
 /// <exception cref="System.ArgumentException">If segmenter data cannot be loaded</exception>
 public ChineseLexiconAndWordSegmenter(string segmenterFileOrUrl, Options op)
 {
     Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = GetSegmenterDataFromFile(segmenterFileOrUrl, op);
     if (cs == null)
     {
         // The loader signals failure by returning null; report it as the documented
         // exception instead of failing with a null dereference below.
         throw new ArgumentException("Could not load segmenter data from " + segmenterFileOrUrl);
     }
     this.op = cs.op;
     // in case a serialized options was read in
     chineseLexicon = cs.chineseLexicon;
     wordSegmenter  = cs.wordSegmenter;
 }
Пример #3
0
 /// <summary>Loads segmenter data from the given file or URL.</summary>
 /// <remarks>
 /// Only the serialized format is attempted.  The text-format fallback is commented
 /// out, so <paramref name="op"/> is not consulted here.
 /// </remarks>
 /// <param name="parserFileOrUrl">File path or URL handed to the serialized-file loader.</param>
 /// <param name="op">Currently unused.</param>
 /// <returns>Whatever the serialized-file loader returns, which may be null.</returns>
 public static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromFile(string parserFileOrUrl, Options op)
 {
     // A null result used to be the trigger for a text-file retry
     // (getSegmenterDataFromTextFile); that retry is disabled, so null is passed through.
     return GetSegmenterDataFromSerializedFile(parserFileOrUrl);
 }
Пример #4
0
 /// <summary>Writes a segmenter to a file in serialized form.</summary>
 /// <remarks>
 /// An IOException is reported by printing its stack trace and is not rethrown.  The
 /// output stream is closed even when writing fails.
 /// </remarks>
 /// <param name="cs">The segmenter to serialize.</param>
 /// <param name="filename">Destination passed to IOUtils.WriteStreamFromString.</param>
 internal static void SaveSegmenterDataToSerialized(Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs, string filename)
 {
     ObjectOutputStream @out   = null;
     bool               closed = false;
     try
     {
         log.Info("Writing segmenter in serialized format to file " + filename + " ");
         @out = IOUtils.WriteStreamFromString(filename);
         @out.WriteObject(cs);
         // Close inside the try so that a failure to flush the final bytes is still reported.
         @out.Close();
         closed = true;
         log.Info("done.");
     }
     catch (IOException ioe)
     {
         Sharpen.Runtime.PrintStackTrace(ioe);
     }
     finally
     {
         if (!closed && @out != null)
         {
             try
             {
                 @out.Close();
             }
             catch (IOException)
             {
                 // The primary failure has already been reported (or is propagating);
                 // a secondary close error adds nothing.
             }
         }
     }
 }
Пример #5
0
 /// <summary>Writes a segmenter to a file in text form.</summary>
 /// <remarks>
 /// The output starts with the line "BEGIN LEXICON", followed by whatever
 /// <c>cs.WriteData</c> emits (skipped when <paramref name="cs"/> is null) and a blank line.
 /// A file name ending in ".gz" is written through a GZIP stream.  An IOException is
 /// logged and its stack trace printed, not rethrown.  The file is closed even when
 /// writing fails.
 /// </remarks>
 /// <param name="cs">The segmenter to write; may be null.</param>
 /// <param name="filename">Destination file path.</param>
 internal static void SaveSegmenterDataToText(Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs, string filename)
 {
     OutputStream fileStream = null;
     PrintWriter  @out       = null;
     bool         closed     = false;
     try
     {
         log.Info("Writing parser in text grammar format to file " + filename);
         bool gzip = filename.EndsWith(".gz");
         // Keep a handle on the raw file stream so it can be released even if
         // constructing one of the wrappers below fails.
         fileStream = new FileOutputStream(filename);
         OutputStream os;
         if (gzip)
         {
             // it's faster to do the buffering _outside_ the gzipping as here
             os = new BufferedOutputStream(new GZIPOutputStream(fileStream));
         }
         else
         {
             os = new BufferedOutputStream(fileStream);
         }
         @out = new PrintWriter(os);
         // The OPTIONS section of the parser text format is deliberately not written here.
         @out.Println("BEGIN " + "LEXICON");
         if (cs != null)
         {
             cs.WriteData(@out);
         }
         @out.Println();
         log.Info(".");
         @out.Flush();
         @out.Close();
         closed = true;
         log.Info("done.");
     }
     catch (IOException e)
     {
         log.Info("Trouble saving segmenter data to ASCII format.");
         Sharpen.Runtime.PrintStackTrace(e);
     }
     finally
     {
         if (!closed)
         {
             try
             {
                 if (@out != null)
                 {
                     @out.Close();
                 }
                 else if (fileStream != null)
                 {
                     fileStream.Close();
                 }
             }
             catch (IOException)
             {
                 // The primary failure has already been reported (or is propagating);
                 // a secondary close error adds nothing.
             }
         }
     }
 }
Пример #6
0
        /// <summary>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// </summary>
        /// <remarks>
        /// Leading arguments that start with '-' are options: <c>-train</c>, <c>-encoding</c>,
        /// <c>-loadFromSerializedFile</c>, <c>-saveToSerializedFile</c>, <c>-saveToTextFile</c>
        /// and <c>-treebank</c> are handled here; anything else is offered to
        /// <c>op.tlpParams.SetOptionFlag</c> and skipped with a log message if not consumed.
        /// The remaining arguments are positional and are consumed in order below.
        /// <p/>
        /// <i>Implementation note:</i> This method is largely cloned from
        /// LexicalizedParser's main method.  Should we try to have it be able
        /// to train segmenters to stop things going out of sync?
        /// </remarks>
        /// <param name="args">Command-line arguments; with none, a usage line is logged and the method returns.</param>
        public static void Main(string[] args)
        {
            bool     train = false;
            bool     saveToSerializedFile      = false;
            bool     saveToTextFile            = false;
            string   serializedInputFileOrUrl  = null;
            // never assigned: loading from a text file is disabled (see the option loop)
            string   textInputFileOrUrl        = null;
            string   serializedOutputFileOrUrl = null;
            string   textOutputFileOrUrl       = null;
            string   treebankPath = null;
            Treebank testTreebank = null;
            string      testPath    = null;
            IFileFilter testFilter  = null;
            IFileFilter trainFilter = null;
            string      encoding    = null;
            // NOTE(review): escaper is not read anywhere in this method; kept in case
            // constructing it matters -- confirm and remove.
            IFunction <IList <IHasWord>, IList <IHasWord> > escaper = new ChineseEscaper();
            int argIndex = 0;

            if (args.Length < 1)
            {
                log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
                return;
            }
            Options op = new Options();

            op.tlpParams = new ChineseTreebankParserParams();
            // while loop through option arguments; an empty argument cannot be an option
            // and ends option processing instead of failing on the [0] index
            while (argIndex < args.Length && args[argIndex].Length > 0 && args[argIndex][0] == '-')
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                {
                    train = true;
                    saveToSerializedFile = true;
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs > 1)
                    {
                        treebankPath = args[argIndex];
                        argIndex++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else if (numSubArgs >= 3)
                    {
                        try
                        {
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex   += 2;
                        }
                        catch (NumberFormatException)
                        {
                            // maybe it's a ranges expression?
                            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                            argIndex++;
                        }
                    }
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
                {
                    // sets encoding for TreebankLangParserParams
                    encoding = args[argIndex + 1];
                    op.tlpParams.SetInputEncoding(encoding);
                    op.tlpParams.SetOutputEncoding(encoding);
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                {
                    // load the parser from a binary serialized file
                    // the next argument must be the path to the parser file
                    serializedInputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                // doesn't make sense to load from TextFile -pichuan
                // (so there is no -loadFromTextFile option)
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                {
                    saveToSerializedFile      = true;
                    serializedOutputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                {
                    // save the parser to declarative text file
                    saveToTextFile      = true;
                    textOutputFileOrUrl = args[argIndex + 1];
                    argIndex           += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                {
                    // the next argument is the treebank path and range for testing
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs == 1)
                    {
                        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else if (numSubArgs > 1)
                    {
                        testPath = args[argIndex++];
                        if (numSubArgs == 2)
                        {
                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                        }
                        else if (numSubArgs >= 3)
                        {
                            try
                            {
                                int low  = System.Convert.ToInt32(args[argIndex]);
                                int high = System.Convert.ToInt32(args[argIndex + 1]);
                                testFilter = new NumberRangeFileFilter(low, high, true);
                                argIndex  += 2;
                            }
                            catch (NumberFormatException)
                            {
                                // maybe it's a ranges expression?
                                testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                            }
                        }
                    }
                }
                else
                {
                    // Let the language-specific params try the flag; if the index comes back
                    // unchanged, the flag was not consumed.
                    int j = op.tlpParams.SetOptionFlag(args, argIndex);
                    if (j == argIndex)
                    {
                        log.Info("Unknown option ignored: " + args[argIndex]);
                        j++;
                    }
                    argIndex = j;
                }
            }
            // end while loop through arguments
            ITreebankLangParserParams tlpParams = op.tlpParams;

            // all other arguments are order dependent and
            // are processed in order below
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
            if (!train && op.testOptions.verbose)
            {
                // DateTime.Now, not new DateTime(): the latter is 0001-01-01, not the current time
                System.Console.Out.WriteLine("Currently " + DateTime.Now);
                PrintArgs(args, System.Console.Out);
            }
            if (train)
            {
                PrintArgs(args, System.Console.Out);
                // so we train a parser using the treebank
                if (treebankPath == null)
                {
                    // the next arg must be the treebank path, since it wasn't give earlier
                    treebankPath = args[argIndex];
                    argIndex++;
                    if (args.Length > argIndex + 1)
                    {
                        try
                        {
                            // the next two args might be the range
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex   += 2;
                        }
                        catch (NumberFormatException)
                        {
                            // maybe it's a ranges expression?
                            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                            argIndex++;
                        }
                    }
                }
                Treebank        trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
                IIndex <string> wordIndex     = new HashIndex <string>();
                IIndex <string> tagIndex      = new HashIndex <string>();
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
            }
            else if (textInputFileOrUrl == null)
            {
                // Loading from a text grammar file is not implemented
                // (XXXXX fix later -pichuan), so we load a serialized segmenter.
                if (serializedInputFileOrUrl == null)
                {
                    // the next argument must be the path to the serialized parser
                    serializedInputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                try
                {
                    cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
                }
                catch (ArgumentException)
                {
                    log.Info("Error loading segmenter, exiting...");
                    // NOTE(review): exits with status 0 although this is an error path -- confirm intent.
                    System.Environment.Exit(0);
                }
            }
            // the following has to go after reading parser to make sure
            // op and tlpParams are the same for train and test
            TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

            if (testFilter != null)
            {
                if (testPath == null)
                {
                    if (treebankPath == null)
                    {
                        throw new Exception("No test treebank path specified...");
                    }
                    else
                    {
                        log.Info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                        testPath = treebankPath;
                    }
                }
                testTreebank = tlpParams.TestMemoryTreebank();
                testTreebank.LoadPath(testPath, testFilter);
            }
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
            // at this point we should be sure that op.tlpParams is
            // set appropriately (from command line, or from grammar file),
            // and will never change again.  We also set the tlpParams of the
            // LexicalizedParser instance to be the same object.  This is
            // redundancy that we probably should take out eventually.
            //
            // -- Roger
            if (op.testOptions.verbose)
            {
                log.Info("Lexicon is " + cs.GetType().FullName);
            }
            PrintWriter pwOut = tlpParams.Pw();
            PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

            // Now what do we do with the parser we've made
            if (saveToTextFile)
            {
                // save the parser to textGrammar format
                if (textOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToText(cs, textOutputFileOrUrl);
                }
                else
                {
                    log.Info("Usage: must specify a text segmenter data output path");
                }
            }
            if (saveToSerializedFile)
            {
                if (serializedOutputFileOrUrl == null && argIndex < args.Length)
                {
                    // the next argument must be the path to serialize to
                    serializedOutputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                if (serializedOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
                }
                else if (textOutputFileOrUrl == null && testTreebank == null)
                {
                    // no saving/parsing request has been specified
                    // (space added before "-train": the pieces used to run together)
                    log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + " -train trainFilesPath [start stop] serializedParserFilename");
                }
            }
            /* --------------------- Testing part!!!! ----------------------- */
            // (printing the options in verbose mode is disabled here)
            if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
            {
                // test parser on treebank
                if (testTreebank == null)
                {
                    // the next argument is the treebank path and range for testing
                    testTreebank = tlpParams.TestMemoryTreebank();
                    if (args.Length < argIndex + 4)
                    {
                        testTreebank.LoadPath(args[argIndex + 1]);
                    }
                    else
                    {
                        int testlow  = System.Convert.ToInt32(args[argIndex + 2]);
                        int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                        testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
                    }
                }
            }
        }
Пример #7
0
        /// <summary>Trains a segmenter lexicon from the trees of a treebank.</summary>
        /// <remarks>
        /// Each tree is optionally passed through a CollinsPuncTransformer (when
        /// <c>op.trainOptions.collinsPunc</c> is set), then annotated and binarized.  The
        /// lexicon obtained from <c>op.tlpParams.Lex</c> is trained on the binarized trees.
        /// Progress is written to standard output.  As a side effect,
        /// <c>op.trainOptions.splitters</c> and <c>postSplitters</c> are overwritten when the
        /// corresponding selective-split options are enabled.
        /// </remarks>
        /// <param name="trainTreebank">Trees to train on.</param>
        /// <param name="op">Training and test options; also supplies the language parameters.</param>
        /// <param name="wordIndex">Word index handed to the lexicon factory.</param>
        /// <param name="tagIndex">Tag index handed to the lexicon factory.</param>
        /// <returns>The trained lexicon, cast to ChineseLexiconAndWordSegmenter.</returns>
        private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
        {
            // DateTime.Now, not new DateTime(): the latter is 0001-01-01, not the current time
            System.Console.Out.WriteLine("Currently " + DateTime.Now);
            Timing.StartTime();
            // setup tree transforms
            ITreebankLangParserParams tlpParams = op.tlpParams;

            if (op.testOptions.verbose)
            {
                System.Console.Out.Write("Training ");
                System.Console.Out.WriteLine(trainTreebank.TextualSummary());
            }
            System.Console.Out.Write("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer;

            if (!op.trainOptions.leftToRight)
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            else
            {
                binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            // stays null unless collinsPunc is enabled; every use below is guarded by the same flag
            CollinsPuncTransformer collinsPuncTransformer = null;

            if (op.trainOptions.collinsPunc)
            {
                collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
            }
            IList <Tree> binaryTrainTrees = new List <Tree>();

            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent split categories: " + op.trainOptions.splitters);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
                Treebank         annotatedTB   = trainTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
                if (op.testOptions.verbose)
                {
                    log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
                }
            }
            if (op.trainOptions.hSelSplit)
            {
                // Preliminary pass with selective splitting switched off.  The transformed
                // trees are discarded; presumably the pass exists so the binarizer can
                // gather statistics before the real pass -- TODO confirm.
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    // a foreach iteration variable cannot be reassigned, so work on a copy
                    Tree scratch = tree;
                    if (op.trainOptions.collinsPunc)
                    {
                        scratch = collinsPuncTransformer.TransformTree(scratch);
                    }
                    binarizer.TransformTree(scratch);
                }
                binarizer.SetDoSelectiveSplit(true);
            }
            foreach (Tree tree_1 in trainTreebank)
            {
                // a foreach iteration variable cannot be reassigned, so work on a copy
                Tree transformed = tree_1;
                if (op.trainOptions.collinsPunc)
                {
                    transformed = collinsPuncTransformer.TransformTree(transformed);
                }
                transformed = binarizer.TransformTree(transformed);
                binaryTrainTrees.Add(transformed);
            }
            Timing.Tick("done.");
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            System.Console.Out.Write("Extracting Lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
            clex.InitializeTraining(binaryTrainTrees.Count);
            clex.Train(binaryTrainTrees);
            clex.FinishTraining();
            Timing.Tick("done.");
            return clex;
        }
Пример #8
0
 /// <summary>Builds a segmenter by training on a treebank.</summary>
 /// <remarks>
 /// Delegates the training to GetSegmenterDataFromTreebank and adopts the lexicon and
 /// word segmenter of the instance it returns.
 /// </remarks>
 /// <param name="trainTreebank">Trees to train on.</param>
 /// <param name="op">Options forwarded to the training routine.</param>
 /// <param name="wordIndex">Word index forwarded to the training routine.</param>
 /// <param name="tagIndex">Tag index forwarded to the training routine.</param>
 private ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter trained =
         GetSegmenterDataFromTreebank(trainTreebank, op, wordIndex, tagIndex);

     // Take over the trained components; the temporary instance itself is dropped.
     this.wordSegmenter  = trained.wordSegmenter;
     this.chineseLexicon = trained.chineseLexicon;
 }