예제 #1
0
 /// <summary>
 /// Creates a classifier test run for <paramref name="testFile"/>, executes it immediately,
 /// and optionally writes the confusion matrix to disk.
 /// </summary>
 /// <remarks>
 /// The constructor does all the work: it copies the tagger's configuration, applies the
 /// configured debug setting, resolves the output file prefix, calls <c>Test()</c>, and, when
 /// <c>writeConfusionMatrix</c> is set, writes <c>confusionMatrix</c> to
 /// <c>saveRoot + ".confusion"</c>.
 /// </remarks>
 /// <param name="maxentTagger">Tagger under test; its <c>config</c> is reused by this instance.</param>
 /// <param name="testFile">Description of the test data, passed to <c>TaggedFileRecord.CreateRecord</c>.</param>
 /// <exception cref="System.IO.IOException"/>
 public TestClassifier(MaxentTagger maxentTagger, string testFile)
 {
     // TODO: can we break this class up in some way?  Perhaps we can
     // spread some functionality into TestSentence and some into MaxentTagger
     // TODO: at the very least, it doesn't seem to make sense to make it
     // an object with state, rather than just some static methods
     // TODO: only one boolean here instead of 4?  They all use the same
     // debug status
     this.maxentTagger = maxentTagger;
     this.config       = maxentTagger.config;
     SetDebug(config.GetDebug());
     fileRecord = TaggedFileRecord.CreateRecord(config, testFile);

     // Fall back to the test file's own name when no debug prefix is configured.
     saveRoot = config.GetDebugPrefix();
     if (string.IsNullOrEmpty(saveRoot))
     {
         saveRoot = fileRecord.Filename();
     }

     Test();

     if (writeConfusionMatrix)
     {
         PrintFile pf = new PrintFile(saveRoot + ".confusion");
         try
         {
             pf.Write(confusionMatrix.ToString());
         }
         finally
         {
             // Close even when Write throws so the output file handle is not leaked.
             pf.Close();
         }
     }
 }
예제 #2
0
        // end static class NERClient
        /// <summary>Starts this server on the specified port, or runs a test client against one.</summary>
        /// <remarks>
        /// Starts this server on the specified port.  The classifier used can be
        /// either a default one stored in the jar file from which this code is
        /// invoked or you can specify it as a filename or as another classifier
        /// resource name, which must correspond to the name of a resource in the
        /// /classifiers/ directory of the jar file.
        /// <para>
        /// Usage: <code>java edu.stanford.nlp.tagger.maxent.MaxentTaggerServer [-model file|-client] -port portNumber [other MaxentTagger options]</code>
        /// </para>
        /// <para>
        /// With no arguments, or without a <c>port</c> property, the usage text is logged and the
        /// method returns. A port that cannot be parsed as a 32-bit integer logs an error and
        /// terminates the process with exit code 1. When the <c>client</c> property is non-empty a
        /// test client is run (encoding defaults to <c>utf-8</c>); otherwise a tagger is loaded
        /// and the server is run.
        /// </para>
        /// </remarks>
        /// <param name="args">Command-line arguments (described above)</param>
        /// <exception cref="System.Exception">If file or Java class problems with serialized classifier</exception>
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                log.Info(Usage);
                return;
            }
            // Use both Properties and TaggerConfig.  It's okay.
            Properties props   = StringUtils.ArgsToProperties(args);
            string     client  = props.GetProperty("client");
            string     portStr = props.GetProperty("port");

            if (string.IsNullOrEmpty(portStr))
            {
                log.Info(Usage);
                return;
            }

            // System.Convert.ToInt32 reports bad input with FormatException/OverflowException,
            // which the former catch clause did not cover; TryParse handles both cases
            // without relying on exceptions.
            int port;
            if (!int.TryParse(portStr, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out port))
            {
                log.Info("Non-numerical port");
                log.Info(Usage);
                System.Environment.Exit(1);
            }

            if (!string.IsNullOrEmpty(client))
            {
                // run a test client for illustration/testing
                string host     = props.GetProperty("host");
                string encoding = props.GetProperty("encoding");
                if (string.IsNullOrEmpty(encoding))
                {
                    encoding = "utf-8";
                }
                MaxentTaggerServer.TaggerClient.CommunicateWithMaxentTaggerServer(host, port, encoding);
            }
            else
            {
                TaggerConfig config = new TaggerConfig(args);
                MaxentTagger tagger = new MaxentTagger(config.GetModel(), config);
                // initializes tagger
                MaxentTagger.TaggerWrapper wrapper = new MaxentTagger.TaggerWrapper(tagger);
                new MaxentTaggerServer(port, wrapper, config.GetEncoding()).Run();
            }
        }
예제 #3
0
        /// <summary>
        /// Reads every tagged file record described by <c>config.GetFile()</c> and hands the
        /// collected word/tag counts to the tagger's dictionary.
        /// </summary>
        /// <param name="config">Supplies the file description from which the records are created.</param>
        /// <param name="maxentTagger">Tagger whose <c>dict</c> receives the word/tag counts.</param>
        /// <param name="pairs">Pairs holder stored on this instance.</param>
        protected internal ReadDataTagged(TaggerConfig config, MaxentTagger maxentTagger, PairsHolder pairs)
        {
            //TODO: make a class DataHolder that holds the dict, tags, pairs, etc, for tagger and pass it around
            this.maxentTagger = maxentTagger;
            this.pairs        = pairs;

            // One map is shared across all files, so the Dictionary can be filled in a
            // single step afterwards; nothing has to mutate the Dictionary or its
            // TagCount objects later.
            IDictionary <string, IntCounter <string> > countsByWord = Generics.NewHashMap();

            foreach (TaggedFileRecord trainingRecord in TaggedFileRecord.CreateRecords(config, config.GetFile()))
            {
                LoadFile(trainingRecord.Reader(), countsByWord);
            }

            maxentTagger.dict.FillWordTagCounts(countsByWord);
        }
 /// <summary>This method gets feature statistics from a training file found in the TaggerConfig.</summary>
 /// <remarks>
 /// It is the start of the training process: the tagged data is read, every token is
 /// featurized, and its (history index, tag index) pair is recorded in <c>vArray</c>.
 /// The tagger's <c>xSize</c>/<c>ySize</c> are then set from this instance's values.
 /// </remarks>
 /// <param name="config">Configuration passed on to <c>ReadDataTagged</c>.</param>
 /// <param name="maxentTagger">Tagger being trained; its <c>xSize</c> and <c>ySize</c> are updated.</param>
 /// <exception cref="System.IO.IOException"/>
 protected internal TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger)
     : this(maxentTagger)
 {
     log.Info("TaggerExperiments: adding word/tags");
     PairsHolder    pairs = new PairsHolder();
     ReadDataTagged c     = new ReadDataTagged(config, maxentTagger, pairs);

     // One row of {history index, tag index} per data token. An empty jagged array
     // here would make the writes in the loop below throw on the first token.
     int dataSize = c.GetSize();
     vArray = new int[dataSize][];
     for (int row = 0; row < dataSize; row++)
     {
         vArray[row] = new int[2];
     }

     InitTemplatesNew();
     log.Info("Featurizing tagged data tokens...");
     // Bounded by the same token count that sizes vArray and is logged below.
     for (int i = 0; i < dataSize; i++)
     {
         DataWordTag d    = c.Get(i);
         string      yS   = d.GetY();
         History     h    = d.GetHistory();
         int         indX = tHistories.Add(h);
         int         indY = d.GetYInd();
         AddTemplatesNew(h, yS);
         AddRareTemplatesNew(h, yS);
         vArray[i][0] = indX;
         vArray[i][1] = indY;
     }
     log.Info("Featurized " + dataSize + " data tokens [done].");
     c.Release();
     Ptilde();
     maxentTagger.xSize = xSize;
     maxentTagger.ySize = ySize;
     log.Info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);
     HashHistories();
     // if we'll look at occurring tags only, we need the histories and pairs still
     if (!maxentTagger.occurringTagsOnly && !maxentTagger.possibleTagsOnly)
     {
         tHistories.Release();
         pairs.Clear();
     }
     GetFeaturesNew();
 }
예제 #5
0
 /// <summary>
 /// Creates a test sentence bound to <paramref name="maxentTagger"/>, taking the tag
 /// separator, encoding and verbosity from the tagger's configuration when it has one.
 /// </summary>
 /// <param name="maxentTagger">
 /// Tagger to use; asserted (in debug builds) to be non-null and to have a lambda solver.
 /// </param>
 public TestSentence(MaxentTagger maxentTagger)
 {
     // Notes carried over with the field declarations (declared outside this constructor):
     //  - origWords is only set when run with a list of HasWords; with a list of strings it is null
     //  - TODO a size field always has the value of sent.size(). Remove it? [cdm 2008]
     System.Diagnostics.Debug.Assert(maxentTagger != null);
     System.Diagnostics.Debug.Assert(maxentTagger.GetLambdaSolve() != null);

     this.maxentTagger = maxentTagger;

     var taggerConfig = maxentTagger.config;
     if (taggerConfig == null)
     {
         // No configuration available: use the built-in defaults.
         tagSeparator = TaggerConfig.GetDefaultTagSeparator();
         encoding     = "utf-8";
         Verbose      = false;
     }
     else
     {
         tagSeparator = taggerConfig.GetTagSeparator();
         encoding     = taggerConfig.GetEncoding();
         Verbose      = taggerConfig.GetVerbose();
     }

     history = new History(pairs, maxentTagger.extractors);
 }