/// <summary>
/// Attaches factors to every variable of the model: one unary factor per token,
/// and one binary factor between each pair of adjacent tokens. Factor tables are
/// sized by the tag set.
/// NOTE(review): both AddFactor calls pass null for the feature-vector generator;
/// the comments suggest anonymous functions existed here originally (likely dropped
/// in translation) — confirm upstream. The token/POS/chunk context locals below are
/// computed but currently unused for the same reason; the dictionary reads are kept
/// because they would throw on missing keys, which preserves behavior.
/// </summary>
public static void Annotate(GraphicalModel model, IList<string> tags, ConcatVectorNamespace @namespace, IDictionary<string, double[]> embeddings)
{
    for (int i = 0; i < model.variableMetaData.Count; i++)
    {
        IDictionary<string, string> meta = model.GetVariableMetaDataByReference(i);
        string token = meta["TOKEN"];
        string pos = meta["POS"];
        string chunk = meta["CHUNK"];
        // Left-neighbor context; "^" marks the sentence start.
        IDictionary<string, string> prevMeta = null;
        if (i > 0)
        {
            prevMeta = model.GetVariableMetaDataByReference(i - 1);
        }
        string leftToken = (prevMeta == null) ? "^" : prevMeta["TOKEN"];
        string leftPos = (prevMeta == null) ? "^" : prevMeta["POS"];
        string leftChunk = (prevMeta == null) ? "^" : prevMeta["CHUNK"];
        // Right-neighbor context; "$" marks the sentence end.
        IDictionary<string, string> nextMeta = null;
        if (i < model.variableMetaData.Count - 1)
        {
            nextMeta = model.GetVariableMetaDataByReference(i + 1);
        }
        string rightToken = (nextMeta == null) ? "$" : nextMeta["TOKEN"];
        string rightPos = (nextMeta == null) ? "$" : nextMeta["POS"];
        string rightChunk = (nextMeta == null) ? "$" : nextMeta["CHUNK"];
        // Unary factor over this variable alone.
        GraphicalModel.Factor unary = model.AddFactor(new int[] { i }, new int[] { tags.Count }, null);
        // This is the anonymous function that generates a feature vector for each assignment to the unary
        // factor
        System.Diagnostics.Debug.Assert((unary.neigborIndices.Length == 1));
        System.Diagnostics.Debug.Assert((unary.neigborIndices[0] == i));
        // If this is not the last variable, add a binary factor linking it to its right neighbor.
        if (i < model.variableMetaData.Count - 1)
        {
            GraphicalModel.Factor binary = model.AddFactor(new int[] { i, i + 1 }, new int[] { tags.Count, tags.Count }, null);
            // This is the anonymous function that generates a feature vector for every joint assignment to the
            // binary factor
            System.Diagnostics.Debug.Assert((binary.neigborIndices.Length == 2));
            System.Diagnostics.Debug.Assert((binary.neigborIndices[0] == i));
            System.Diagnostics.Debug.Assert((binary.neigborIndices[1] == i + 1));
        }
    }
}
/// <summary>
/// Builds a GraphicalModel for a single CoNLL sentence: one variable per token,
/// annotated with the gold training label (the index of the gold NER tag in
/// <paramref name="tags"/>) and the token-level metadata the featurizer reads.
/// </summary>
public virtual GraphicalModel GenerateSentenceModel(ConcatVectorNamespace @namespace, CoNLLBenchmark.CoNLLSentence sentence, IList<string> tags)
{
    GraphicalModel model = new GraphicalModel();
    for (int tokenIndex = 0; tokenIndex < sentence.token.Count; tokenIndex++)
    {
        // Add the training label. GetVariableMetaDataByReference is called on a fresh
        // model here, so it presumably materializes the metadata map on demand — confirm.
        IDictionary<string, string> meta = model.GetVariableMetaDataByReference(tokenIndex);
        // string.Empty + x coerces a possibly-null value into a (possibly empty) string.
        meta[LogLikelihoodDifferentiableFunction.VariableTrainingValue] = string.Empty + tags.IndexOf(sentence.ner[tokenIndex]);
        meta["TOKEN"] = string.Empty + sentence.token[tokenIndex];
        meta["POS"] = string.Empty + sentence.pos[tokenIndex];
        meta["CHUNK"] = string.Empty + sentence.npchunk[tokenIndex];
        meta["TAG"] = string.Empty + sentence.ner[tokenIndex];
    }
    // Attach the unary/binary factors for this sentence.
    CoNLLFeaturizer.Annotate(model, tags, @namespace, embeddings);
    System.Diagnostics.Debug.Assert((model.factors != null));
    foreach (GraphicalModel.Factor factor in model.factors)
    {
        System.Diagnostics.Debug.Assert((factor != null));
    }
    return model;
}
/// <summary>
/// Benchmark entry point: builds CliqueTree-ready models from the CoNLL data,
/// generates random "human observation" feature vectors and weights, then runs
/// the Gameplay routine — ten warm-up iterations followed by ten timed ones.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    //////////////////////////////////////////////////////////////
    // Generate the CoNLL CliqueTrees to use during gameplay
    //////////////////////////////////////////////////////////////
    CoNLLBenchmark coNLL = new CoNLLBenchmark();
    IList<CoNLLBenchmark.CoNLLSentence> train = coNLL.GetSentences(DataPath + "conll.iob.4class.train");
    IList<CoNLLBenchmark.CoNLLSentence> testA = coNLL.GetSentences(DataPath + "conll.iob.4class.testa");
    IList<CoNLLBenchmark.CoNLLSentence> testB = coNLL.GetSentences(DataPath + "conll.iob.4class.testb");
    IList<CoNLLBenchmark.CoNLLSentence> allData = new List<CoNLLBenchmark.CoNLLSentence>();
    Sharpen.Collections.AddAll(allData, train);
    Sharpen.Collections.AddAll(allData, testA);
    Sharpen.Collections.AddAll(allData, testB);
    // Collect the distinct NER tags across every split.
    ICollection<string> tagsSet = new HashSet<string>();
    foreach (CoNLLBenchmark.CoNLLSentence sentence in allData)
    {
        foreach (string nerTag in sentence.ner)
        {
            tagsSet.Add(nerTag);
        }
    }
    IList<string> tags = new List<string>();
    Sharpen.Collections.AddAll(tags, tagsSet);
    coNLL.embeddings = coNLL.GetEmbeddings(DataPath + "google-300-trimmed.ser.gz", allData);
    log.Info("Making the training set...");
    ConcatVectorNamespace @namespace = new ConcatVectorNamespace();
    int trainSize = train.Count;
    GraphicalModel[] trainingSet = new GraphicalModel[trainSize];
    for (int i = 0; i < trainSize; i++)
    {
        if (i % 10 == 0)
        {
            log.Info(i + "/" + trainSize);
        }
        trainingSet[i] = coNLL.GenerateSentenceModel(@namespace, train[i], tags);
    }
    //////////////////////////////////////////////////////////////
    // Generate the random human observation feature vectors that we'll use
    //////////////////////////////////////////////////////////////
    Random r = new Random(10); // fixed seed keeps runs reproducible
    int numFeatures = 5;
    int featureLength = 30;
    ConcatVector[] humanFeatureVectors = new ConcatVector[1000];
    for (int vec = 0; vec < humanFeatureVectors.Length; vec++)
    {
        humanFeatureVectors[vec] = new ConcatVector(numFeatures);
        for (int j = 0; j < numFeatures; j++)
        {
            if (r.NextBoolean())
            {
                // Sparse component: a single random entry at a random index.
                humanFeatureVectors[vec].SetSparseComponent(j, r.NextInt(featureLength), r.NextDouble());
            }
            else
            {
                // Dense component: fully random values.
                double[] dense = new double[featureLength];
                for (int k = 0; k < dense.Length; k++)
                {
                    dense[k] = r.NextDouble();
                }
                humanFeatureVectors[vec].SetDenseComponent(j, dense);
            }
        }
    }
    // Random dense weight vector with the same component layout.
    ConcatVector weights = new ConcatVector(numFeatures);
    for (int component = 0; component < numFeatures; component++)
    {
        double[] dense = new double[featureLength];
        for (int j = 0; j < dense.Length; j++)
        {
            dense[j] = r.NextDouble();
        }
        weights.SetDenseComponent(component, dense);
    }
    //////////////////////////////////////////////////////////////
    // Actually perform gameplay-like random mutations
    //////////////////////////////////////////////////////////////
    log.Info("Warming up the JIT...");
    for (int warmup = 0; warmup < 10; warmup++)
    {
        log.Info(warmup);
        Gameplay(r, trainingSet[warmup], weights, humanFeatureVectors);
    }
    // The timed run deliberately replays the same first ten models that were warmed up.
    log.Info("Timing actual run...");
    long start = Runtime.CurrentTimeMillis();
    for (int timed = 0; timed < 10; timed++)
    {
        log.Info(timed);
        Gameplay(r, trainingSet[timed], weights, humanFeatureVectors);
    }
    long duration = Runtime.CurrentTimeMillis() - start;
    log.Info("Duration: " + duration);
}
/// <summary>
/// End-to-end optimizer benchmark: builds training models from the CoNLL train split,
/// trains via BacktrackingAdaGradOptimizer (the call being benchmarked), then reports
/// accuracy and per-tag P/R/F1 on the testa split.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void BenchmarkOptimizer()
{
    IList<CoNLLBenchmark.CoNLLSentence> train = GetSentences(DataPath + "conll.iob.4class.train");
    IList<CoNLLBenchmark.CoNLLSentence> testA = GetSentences(DataPath + "conll.iob.4class.testa");
    // NOTE(review): testB is loaded and contributes to the tag set and embeddings
    // vocabulary below, but is never evaluated in this method.
    IList<CoNLLBenchmark.CoNLLSentence> testB = GetSentences(DataPath + "conll.iob.4class.testb");
    IList<CoNLLBenchmark.CoNLLSentence> allData = new List<CoNLLBenchmark.CoNLLSentence>();
    Sharpen.Collections.AddAll(allData, train);
    Sharpen.Collections.AddAll(allData, testA);
    Sharpen.Collections.AddAll(allData, testB);
    // Collect the distinct NER tags across every split.
    ICollection<string> tagsSet = new HashSet<string>();
    foreach (CoNLLBenchmark.CoNLLSentence sentence in allData)
    {
        foreach (string nerTag in sentence.ner)
        {
            tagsSet.Add(nerTag);
        }
    }
    IList<string> tags = new List<string>();
    Sharpen.Collections.AddAll(tags, tagsSet);
    embeddings = GetEmbeddings(DataPath + "google-300-trimmed.ser.gz", allData);
    log.Info("Making the training set...");
    ConcatVectorNamespace @namespace = new ConcatVectorNamespace();
    int trainSize = train.Count;
    GraphicalModel[] trainingSet = new GraphicalModel[trainSize];
    for (int i = 0; i < trainSize; i++)
    {
        if (i % 10 == 0)
        {
            log.Info(i + "/" + trainSize);
        }
        trainingSet[i] = GenerateSentenceModel(@namespace, train[i], tags);
    }
    log.Info("Training system...");
    AbstractBatchOptimizer opt = new BacktrackingAdaGradOptimizer();
    // This training call is basically what we want the benchmark for. It should take
    // 99% of the wall clock time. (Fix: this comment was previously split across a line
    // break so that "It should take 99% ..." landed in code position — a compile error.)
    ConcatVector weights = opt.Optimize(trainingSet, new LogLikelihoodDifferentiableFunction(), @namespace.NewWeightsVector(), 0.01, 1.0e-5, false);
    log.Info("Testing system...");
    // Evaluation method lifted from the CoNLL 2004 perl script
    IDictionary<string, double> correctChunk = new Dictionary<string, double>();
    IDictionary<string, double> foundCorrect = new Dictionary<string, double>();
    IDictionary<string, double> foundGuessed = new Dictionary<string, double>();
    double correct = 0.0;
    double total = 0.0;
    foreach (CoNLLBenchmark.CoNLLSentence sentence_1 in testA)
    {
        GraphicalModel model = GenerateSentenceModel(@namespace, sentence_1, tags);
        // MAP inference yields the single best tag sequence for the sentence.
        int[] guesses = new CliqueTree(model, weights).CalculateMAP();
        string[] nerGuesses = new string[guesses.Length];
        for (int i = 0; i < guesses.Length; i++)
        {
            nerGuesses[i] = tags[guesses[i]];
            if (nerGuesses[i].Equals(sentence_1.ner[i]))
            {
                correct++;
                correctChunk[nerGuesses[i]] = correctChunk.GetOrDefault(nerGuesses[i], 0.0) + 1;
            }
            total++;
            foundCorrect[sentence_1.ner[i]] = foundCorrect.GetOrDefault(sentence_1.ner[i], 0.0) + 1;
            foundGuessed[nerGuesses[i]] = foundGuessed.GetOrDefault(nerGuesses[i], 0.0) + 1;
        }
    }
    log.Info("\nSystem results:\n");
    log.Info("Accuracy: " + (correct / total) + "\n");
    foreach (string tag in tags)
    {
        // Guard the divisions: a tag never guessed (or never present) scores 0, not NaN.
        double precision = foundGuessed.GetOrDefault(tag, 0.0) == 0 ? 0.0 : correctChunk.GetOrDefault(tag, 0.0) / foundGuessed[tag];
        double recall = foundCorrect.GetOrDefault(tag, 0.0) == 0 ? 0.0 : correctChunk.GetOrDefault(tag, 0.0) / foundCorrect[tag];
        double f1 = (precision + recall == 0.0) ? 0.0 : (precision * recall * 2) / (precision + recall);
        log.Info(tag + " (" + foundCorrect.GetOrDefault(tag, 0.0) + ")");
        log.Info("\tP:" + precision + " (" + correctChunk.GetOrDefault(tag, 0.0) + "/" + foundGuessed.GetOrDefault(tag, 0.0) + ")");
        log.Info("\tR:" + recall + " (" + correctChunk.GetOrDefault(tag, 0.0) + "/" + foundCorrect.GetOrDefault(tag, 0.0) + ")");
        log.Info("\tF1:" + f1);
    }
}