/// <summary>
/// Builds a classifier test run for <paramref name="testFile"/> and executes it immediately.
/// </summary>
/// <remarks>
/// The constructor copies the tagger's configuration, applies its debug setting, creates the
/// tagged file record for the test file, runs <c>Test()</c> and, when
/// <c>writeConfusionMatrix</c> is set, writes the confusion matrix to
/// <c>saveRoot + ".confusion"</c>.
/// </remarks>
/// <param name="maxentTagger">Tagger whose configuration is used for the test run.</param>
/// <param name="testFile">Description of the test file, passed to <c>TaggedFileRecord.CreateRecord</c>.</param>
/// <exception cref="System.IO.IOException"/>
public TestClassifier(MaxentTagger maxentTagger, string testFile)
{
    // TODO: can we break this class up in some way?  Perhaps we can
    // spread some functionality into TestSentence and some into MaxentTagger
    // TODO: at the very least, it doesn't seem to make sense to make it
    // an object with state, rather than just some static methods
    // TODO: only one boolean here instead of 4?  They all use the same
    // debug status
    this.maxentTagger = maxentTagger;
    this.config = maxentTagger.config;
    SetDebug(config.GetDebug());

    fileRecord = TaggedFileRecord.CreateRecord(config, testFile);

    // Fall back to the test file's own name when no debug prefix is configured.
    saveRoot = config.GetDebugPrefix();
    if (string.IsNullOrEmpty(saveRoot))
    {
        saveRoot = fileRecord.Filename();
    }

    Test();

    if (writeConfusionMatrix)
    {
        PrintFile pf = new PrintFile(saveRoot + ".confusion");
        try
        {
            pf.Write(confusionMatrix.ToString());
        }
        finally
        {
            // Close even if the write fails so the file handle is not leaked.
            pf.Close();
        }
    }
}
// end static class NERClient
/// <summary>Starts this server on the specified port.</summary>
/// <remarks>
/// Starts this server on the specified port.  The classifier used can be
/// either a default one stored in the jar file from which this code is
/// invoked or you can specify it as a filename or as another classifier
/// resource name, which must correspond to the name of a resource in the
/// /classifiers/ directory of the jar file.
/// <p>
/// Usage: <code>java edu.stanford.nlp.tagger.maxent.MaxentTaggerServer [-model file|-client] -port portNumber [other MaxentTagger options]</code>
/// <p>
/// With no arguments, or with a missing/empty <c>port</c> property, the usage text is logged
/// and the method returns. A port that cannot be parsed as an integer logs an error plus the
/// usage text and exits the process with status 1.
/// </remarks>
/// <param name="args">Command-line arguments (described above)</param>
/// <exception cref="System.Exception">If file or Java class problems with serialized classifier</exception>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        log.Info(Usage);
        return;
    }

    // Use both Properties and TaggerConfig.  It's okay.
    Properties props = StringUtils.ArgsToProperties(args);
    string client = props.GetProperty("client");
    string portStr = props.GetProperty("port");
    if (string.IsNullOrEmpty(portStr))
    {
        log.Info(Usage);
        return;
    }

    // TryParse accepts the same input as Convert.ToInt32(string) but reports malformed or
    // out-of-range values through its return value, so a bad port always reaches the
    // usage message instead of escaping as FormatException/OverflowException.
    int port;
    if (!int.TryParse(portStr, out port))
    {
        log.Info("Non-numerical port");
        log.Info(Usage);
        System.Environment.Exit(1);
    }

    if (!string.IsNullOrEmpty(client))
    {
        // run a test client for illustration/testing
        string host = props.GetProperty("host");
        string encoding = props.GetProperty("encoding");
        if (string.IsNullOrEmpty(encoding))
        {
            encoding = "utf-8";
        }
        MaxentTaggerServer.TaggerClient.CommunicateWithMaxentTaggerServer(host, port, encoding);
    }
    else
    {
        TaggerConfig config = new TaggerConfig(args);
        // initializes tagger
        MaxentTagger tagger = new MaxentTagger(config.GetModel(), config);
        MaxentTagger.TaggerWrapper wrapper = new MaxentTagger.TaggerWrapper(tagger);
        new MaxentTaggerServer(port, wrapper, config.GetEncoding()).Run();
    }
}
/// <summary>
/// Loads the tagged files named by <paramref name="config"/> and hands the collected
/// word/tag counts to the tagger's dictionary.
/// </summary>
/// <param name="config">Configuration supplying the file description via <c>GetFile()</c>.</param>
/// <param name="maxentTagger">Tagger whose <c>dict</c> receives the word/tag counts.</param>
/// <param name="pairs">Pairs holder stored on this instance.</param>
protected internal ReadDataTagged(TaggerConfig config, MaxentTagger maxentTagger, PairsHolder pairs)
{
    //TODO: make a class DataHolder that holds the dict, tags, pairs, etc, for tagger and pass it around
    this.pairs = pairs;
    this.maxentTagger = maxentTagger;

    IList<TaggedFileRecord> taggedFiles = TaggedFileRecord.CreateRecords(config, config.GetFile());
    IDictionary<string, IntCounter<string>> countsByWord = Generics.NewHashMap();
    foreach (TaggedFileRecord taggedFile in taggedFiles)
    {
        LoadFile(taggedFile.Reader(), countsByWord);
    }

    // Counting first and filling the Dictionary in one step afterwards means
    // nothing has to mutate the Dictionary or its TagCount objects later.
    maxentTagger.dict.FillWordTagCounts(countsByWord);
}
/// <summary>This method gets feature statistics from a training file found in the TaggerConfig.</summary>
/// <remarks>
/// This method gets feature statistics from a training file found in the TaggerConfig.
/// It is the start of the training process.
/// <p>
/// Steps: read the tagged data, featurize every token (recording its history index and tag
/// index in <c>vArray</c>), release the reader, call <c>Ptilde()</c>, publish
/// <c>xSize</c>/<c>ySize</c> to the tagger, hash the histories and finally collect the features.
/// </remarks>
/// <param name="config">Tagger configuration passed on to <c>ReadDataTagged</c>.</param>
/// <param name="maxentTagger">Tagger being trained; receives <c>xSize</c> and <c>ySize</c>.</param>
/// <exception cref="System.IO.IOException"/>
protected internal TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger)
    : this(maxentTagger)
{
    log.Info("TaggerExperiments: adding word/tags");
    PairsHolder pairs = new PairsHolder();
    ReadDataTagged c = new ReadDataTagged(config, maxentTagger, pairs);

    // One {history index, tag index} row per data token. An empty jagged array here
    // would make the vArray[i][0] write below throw on the very first token.
    int tokenCount = c.GetSize();
    vArray = new int[tokenCount][];
    for (int row = 0; row < tokenCount; row++)
    {
        vArray[row] = new int[2];
    }

    InitTemplatesNew();
    log.Info("Featurizing tagged data tokens...");
    // NOTE(review): `size` is declared outside this constructor; it is presumably equal to
    // c.GetSize() -- confirm, since vArray is dimensioned from c.GetSize().
    for (int i = 0; i < size; i++)
    {
        DataWordTag d = c.Get(i);
        string yS = d.GetY();
        History h = d.GetHistory();
        int indX = tHistories.Add(h);
        int indY = d.GetYInd();
        AddTemplatesNew(h, yS);
        AddRareTemplatesNew(h, yS);
        vArray[i][0] = indX;
        vArray[i][1] = indY;
    }
    // It's the 2010s now and it doesn't take so long to featurize....
    // if (i > 0 && (i % 10000) == 0) {
    //   System.err.printf("%d ", i);
    //   if (i % 100000 == 0) { System.err.println(); }
    // }
    // log.info();
    log.Info("Featurized " + c.GetSize() + " data tokens [done].");
    c.Release();
    Ptilde();
    maxentTagger.xSize = xSize;
    maxentTagger.ySize = ySize;
    log.Info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);
    HashHistories();
    // if we'll look at occurring tags only, we need the histories and pairs still
    if (!maxentTagger.occurringTagsOnly && !maxentTagger.possibleTagsOnly)
    {
        tHistories.Release();
        pairs.Clear();
    }
    GetFeaturesNew();
}
/// <summary>
/// Creates a test sentence bound to <paramref name="maxentTagger"/>, taking the tag separator,
/// encoding and verbosity from the tagger's configuration when one is present.
/// </summary>
/// <remarks>
/// Without a configuration the defaults are <c>TaggerConfig.GetDefaultTagSeparator()</c>,
/// <c>"utf-8"</c> and non-verbose.
/// </remarks>
/// <param name="maxentTagger">
/// Tagger to use; debug assertions require it, and its <c>GetLambdaSolve()</c> result, to be non-null.
/// </param>
public TestSentence(MaxentTagger maxentTagger)
{
    // NOTE(review): the next four remarks seem to describe fields declared elsewhere
    // in this class rather than anything in this constructor -- confirm.
    // origWords is only set when run with a list of HasWords; when run
    // with a list of strings, this will be null
    // TODO this always has the value of sent.size(). Remove it? [cdm 2008]
    // protected double[][][] probabilities;
    // = 0;
    System.Diagnostics.Debug.Assert(maxentTagger != null);
    System.Diagnostics.Debug.Assert(maxentTagger.GetLambdaSolve() != null);
    this.maxentTagger = maxentTagger;

    if (maxentTagger.config == null)
    {
        // No configuration available: use the built-in defaults.
        tagSeparator = TaggerConfig.GetDefaultTagSeparator();
        encoding = "utf-8";
        Verbose = false;
    }
    else
    {
        tagSeparator = maxentTagger.config.GetTagSeparator();
        encoding = maxentTagger.config.GetEncoding();
        Verbose = maxentTagger.config.GetVerbose();
    }

    history = new History(pairs, maxentTagger.extractors);
}