Example #1
0
 /// <summary>
 /// Attaches a unary factor to every variable in the model and a binary
 /// (transition) factor between each pair of adjacent variables. Surface
 /// features (token / POS / chunk for the variable and both neighbors) are
 /// read up front so that a missing metadata key fails fast via the
 /// dictionary indexer, exactly as the feature extraction expects.
 /// NOTE(review): both AddFactor calls pass null for the featurizer — presumably
 /// the anonymous feature-vector closures were stripped in translation; confirm upstream.
 /// </summary>
 /// <param name="model">Model whose variables receive the factors.</param>
 /// <param name="tags">Closed tag set; every factor dimension is tags.Count.</param>
 /// <param name="namespace">Feature namespace (not referenced in this body).</param>
 /// <param name="embeddings">Word-embedding table (not referenced in this body).</param>
 public static void Annotate(GraphicalModel model, IList<string> tags, ConcatVectorNamespace @namespace, IDictionary<string, double[]> embeddings)
 {
     for (int position = 0; position < model.variableMetaData.Count; position++)
     {
         IDictionary<string, string> here = model.GetVariableMetaDataByReference(position);
         // Indexing (rather than TryGetValue) intentionally throws when a key is absent.
         string token = here["TOKEN"];
         string pos = here["POS"];
         string chunk = here["CHUNK"];
         // Left neighbor's metadata, or null at the start of the sentence;
         // "^" is the sentinel feature value for "no left neighbor".
         IDictionary<string, string> left = position > 0 ? model.GetVariableMetaDataByReference(position - 1) : null;
         string leftToken = left == null ? "^" : left["TOKEN"];
         string leftPos = left == null ? "^" : left["POS"];
         string leftChunk = left == null ? "^" : left["CHUNK"];
         // Right neighbor's metadata, or null at the end of the sentence;
         // "$" is the sentinel feature value for "no right neighbor".
         IDictionary<string, string> right = position < model.variableMetaData.Count - 1 ? model.GetVariableMetaDataByReference(position + 1) : null;
         string rightToken = right == null ? "$" : right["TOKEN"];
         string rightPos = right == null ? "$" : right["POS"];
         string rightChunk = right == null ? "$" : right["CHUNK"];
         // Unary factor over this single variable.
         GraphicalModel.Factor unary = model.AddFactor(new int[] { position }, new int[] { tags.Count }, null);
         System.Diagnostics.Debug.Assert((unary.neigborIndices.Length == 1));
         System.Diagnostics.Debug.Assert((unary.neigborIndices[0] == position));
         // Every variable except the last also gets a binary factor linking it
         // to its successor.
         if (position < model.variableMetaData.Count - 1)
         {
             GraphicalModel.Factor binary = model.AddFactor(new int[] { position, position + 1 }, new int[] { tags.Count, tags.Count }, null);
             System.Diagnostics.Debug.Assert((binary.neigborIndices.Length == 2));
             System.Diagnostics.Debug.Assert((binary.neigborIndices[0] == position));
             System.Diagnostics.Debug.Assert((binary.neigborIndices[1] == position + 1));
         }
     }
 }
        /// <summary>
        /// Builds the GraphicalModel for a single sentence: writes the supervised
        /// training label and the surface features (token / POS / chunk / NER tag)
        /// into each variable's metadata map, then lets CoNLLFeaturizer.Annotate
        /// attach the unary and binary factors.
        /// </summary>
        /// <param name="namespace">Shared feature namespace used by the featurizer.</param>
        /// <param name="sentence">Sentence whose parallel token/pos/npchunk/ner lists are read.</param>
        /// <param name="tags">Tag inventory; the training label is the index of the gold tag.</param>
        /// <returns>A fully annotated model with one variable per token.</returns>
        public virtual GraphicalModel GenerateSentenceModel(ConcatVectorNamespace @namespace, CoNLLBenchmark.CoNLLSentence sentence, IList<string> tags)
        {
            GraphicalModel model = new GraphicalModel();
            for (int position = 0; position < sentence.token.Count; position++)
            {
                // GetVariableMetaDataByReference hands back the metadata map for
                // this variable, which we populate in place.
                IDictionary<string, string> meta = model.GetVariableMetaDataByReference(position);
                // Supervised label: index of the gold NER tag within the tag list.
                meta[LogLikelihoodDifferentiableFunction.VariableTrainingValue] = tags.IndexOf(sentence.ner[position]).ToString();
                // Null-coalescing mirrors the original "" + value null-safety.
                meta["TOKEN"] = sentence.token[position] ?? string.Empty;
                meta["POS"] = sentence.pos[position] ?? string.Empty;
                meta["CHUNK"] = sentence.npchunk[position] ?? string.Empty;
                meta["TAG"] = sentence.ner[position] ?? string.Empty;
            }
            CoNLLFeaturizer.Annotate(model, tags, @namespace, embeddings);
            // Sanity-check that annotation produced a well-formed factor list.
            System.Diagnostics.Debug.Assert((model.factors != null));
            foreach (GraphicalModel.Factor factor in model.factors)
            {
                System.Diagnostics.Debug.Assert((factor != null));
            }
            return model;
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <summary>
        /// Benchmark entry point. Loads the CoNLL splits, builds a GraphicalModel
        /// per training sentence, generates seeded-random "human observation"
        /// feature vectors, then times gameplay-style random mutations over the
        /// first few training models (one warm-up pass, one timed pass).
        /// </summary>
        public static void Main(string[] args)
        {
            //////////////////////////////////////////////////////////////
            // Generate the CoNLL CliqueTrees to use during gameplay
            //////////////////////////////////////////////////////////////
            CoNLLBenchmark coNLL = new CoNLLBenchmark();
            IList<CoNLLBenchmark.CoNLLSentence> train = coNLL.GetSentences(DataPath + "conll.iob.4class.train");
            IList<CoNLLBenchmark.CoNLLSentence> testA = coNLL.GetSentences(DataPath + "conll.iob.4class.testa");
            IList<CoNLLBenchmark.CoNLLSentence> testB = coNLL.GetSentences(DataPath + "conll.iob.4class.testb");
            IList<CoNLLBenchmark.CoNLLSentence> allData = new List<CoNLLBenchmark.CoNLLSentence>();
            Sharpen.Collections.AddAll(allData, train);
            Sharpen.Collections.AddAll(allData, testA);
            Sharpen.Collections.AddAll(allData, testB);
            // The tag inventory is every NER label observed anywhere in the corpus.
            ICollection<string> tagsSet = new HashSet<string>();
            foreach (CoNLLBenchmark.CoNLLSentence sentence in allData)
            {
                foreach (string nerTag in sentence.ner)
                {
                    tagsSet.Add(nerTag);
                }
            }
            IList<string> tags = new List<string>();
            Sharpen.Collections.AddAll(tags, tagsSet);
            coNLL.embeddings = coNLL.GetEmbeddings(DataPath + "google-300-trimmed.ser.gz", allData);
            log.Info("Making the training set...");
            ConcatVectorNamespace @namespace = new ConcatVectorNamespace();
            int trainSize = train.Count;
            GraphicalModel[] trainingSet = new GraphicalModel[trainSize];
            for (int i = 0; i < trainSize; i++)
            {
                if (i % 10 == 0)
                {
                    log.Info(i + "/" + trainSize);
                }
                trainingSet[i] = coNLL.GenerateSentenceModel(@namespace, train[i], tags);
            }
            //////////////////////////////////////////////////////////////
            // Generate the random human observation feature vectors that we'll use
            //////////////////////////////////////////////////////////////
            // Fixed seed so every benchmark run draws the same "human" vectors.
            Random r = new Random(10);
            int numFeatures = 5;
            int featureLength = 30;
            ConcatVector[] humanFeatureVectors = new ConcatVector[1000];
            for (int vector = 0; vector < humanFeatureVectors.Length; vector++)
            {
                humanFeatureVectors[vector] = new ConcatVector(numFeatures);
                for (int feature = 0; feature < numFeatures; feature++)
                {
                    // Coin-flip between a sparse (single-entry) and a fully dense component.
                    if (r.NextBoolean())
                    {
                        humanFeatureVectors[vector].SetSparseComponent(feature, r.NextInt(featureLength), r.NextDouble());
                    }
                    else
                    {
                        double[] dense = new double[featureLength];
                        for (int k = 0; k < dense.Length; k++)
                        {
                            dense[k] = r.NextDouble();
                        }
                        humanFeatureVectors[vector].SetDenseComponent(feature, dense);
                    }
                }
            }
            // Random dense model weights, one component per feature.
            ConcatVector weights = new ConcatVector(numFeatures);
            for (int component = 0; component < numFeatures; component++)
            {
                double[] dense = new double[featureLength];
                for (int j = 0; j < dense.Length; j++)
                {
                    dense[j] = r.NextDouble();
                }
                weights.SetDenseComponent(component, dense);
            }
            //////////////////////////////////////////////////////////////
            // Actually perform gameplay-like random mutations
            //////////////////////////////////////////////////////////////
            // Bug fix: the original always ran exactly 10 iterations, which threw
            // IndexOutOfRangeException whenever the training split held fewer than
            // 10 sentences. Clamp to the available model count instead.
            int iterations = trainSize < 10 ? trainSize : 10;
            log.Info("Warming up the JIT...");
            for (int warmup = 0; warmup < iterations; warmup++)
            {
                log.Info(warmup);
                Gameplay(r, trainingSet[warmup], weights, humanFeatureVectors);
            }
            log.Info("Timing actual run...");
            long start = Runtime.CurrentTimeMillis();
            for (int run = 0; run < iterations; run++)
            {
                log.Info(run);
                Gameplay(r, trainingSet[run], weights, humanFeatureVectors);
            }
            long duration = Runtime.CurrentTimeMillis() - start;
            log.Info("Duration: " + duration);
        }
        /// <exception cref="System.Exception"/>
        /// <summary>
        /// Trains a log-linear sequence model on the CoNLL training split with
        /// backtracking AdaGrad, then reports overall per-token accuracy plus
        /// per-tag precision / recall / F1 on the testa split.
        /// NOTE(review): despite the "chunk" naming, the counts below are
        /// incremented per token, not per chunk span — confirm against the
        /// original CoNLL 2004 evaluation if exact conlleval parity matters.
        /// </summary>
        public virtual void BenchmarkOptimizer()
        {
            IList<CoNLLBenchmark.CoNLLSentence> train = GetSentences(DataPath + "conll.iob.4class.train");
            IList<CoNLLBenchmark.CoNLLSentence> testA = GetSentences(DataPath + "conll.iob.4class.testa");
            IList<CoNLLBenchmark.CoNLLSentence> testB = GetSentences(DataPath + "conll.iob.4class.testb");
            IList<CoNLLBenchmark.CoNLLSentence> allData = new List<CoNLLBenchmark.CoNLLSentence>();
            Sharpen.Collections.AddAll(allData, train);
            Sharpen.Collections.AddAll(allData, testA);
            Sharpen.Collections.AddAll(allData, testB);
            // The tag inventory is every NER label observed anywhere in the corpus.
            ICollection<string> tagsSet = new HashSet<string>();
            foreach (CoNLLBenchmark.CoNLLSentence dataSentence in allData)
            {
                foreach (string nerTag in dataSentence.ner)
                {
                    tagsSet.Add(nerTag);
                }
            }
            IList<string> tags = new List<string>();
            Sharpen.Collections.AddAll(tags, tagsSet);
            embeddings = GetEmbeddings(DataPath + "google-300-trimmed.ser.gz", allData);
            log.Info("Making the training set...");
            ConcatVectorNamespace @namespace = new ConcatVectorNamespace();
            int trainSize = train.Count;
            GraphicalModel[] trainingSet = new GraphicalModel[trainSize];
            for (int i = 0; i < trainSize; i++)
            {
                if (i % 10 == 0)
                {
                    log.Info(i + "/" + trainSize);
                }
                trainingSet[i] = GenerateSentenceModel(@namespace, train[i], tags);
            }
            log.Info("Training system...");
            AbstractBatchOptimizer opt = new BacktrackingAdaGradOptimizer();
            // This training call is basically what we want the benchmark for. It should take 99% of the wall clock time
            ConcatVector weights = opt.Optimize(trainingSet, new LogLikelihoodDifferentiableFunction(), @namespace.NewWeightsVector(), 0.01, 1.0e-5, false);
            log.Info("Testing system...");
            // Evaluation method lifted from the CoNLL 2004 perl script
            IDictionary<string, double> correctChunk = new Dictionary<string, double>();
            IDictionary<string, double> foundCorrect = new Dictionary<string, double>();
            IDictionary<string, double> foundGuessed = new Dictionary<string, double>();
            double correct = 0.0;
            double total = 0.0;
            foreach (CoNLLBenchmark.CoNLLSentence testSentence in testA)
            {
                GraphicalModel model = GenerateSentenceModel(@namespace, testSentence, tags);
                // MAP inference gives the single best joint tag assignment.
                int[] map = new CliqueTree(model, weights).CalculateMAP();
                string[] nerGuesses = new string[map.Length];
                for (int t = 0; t < map.Length; t++)
                {
                    string guess = tags[map[t]];
                    nerGuesses[t] = guess;
                    string gold = testSentence.ner[t];
                    if (guess.Equals(gold))
                    {
                        correct++;
                        correctChunk[guess] = correctChunk.GetOrDefault(guess, 0.0) + 1;
                    }
                    total++;
                    foundCorrect[gold] = foundCorrect.GetOrDefault(gold, 0.0) + 1;
                    foundGuessed[guess] = foundGuessed.GetOrDefault(guess, 0.0) + 1;
                }
            }
            log.Info("\nSystem results:\n");
            log.Info("Accuracy: " + (correct / total) + "\n");
            foreach (string tag in tags)
            {
                // Hoist the three per-tag counts once; GetOrDefault leaves the maps untouched,
                // and a nonzero default-checked count guarantees the key exists.
                double guessed = foundGuessed.GetOrDefault(tag, 0.0);
                double actual = foundCorrect.GetOrDefault(tag, 0.0);
                double matched = correctChunk.GetOrDefault(tag, 0.0);
                double precision = guessed == 0 ? 0.0 : matched / guessed;
                double recall = actual == 0 ? 0.0 : matched / actual;
                double f1 = precision + recall == 0.0 ? 0.0 : (precision * recall * 2) / (precision + recall);
                log.Info(tag + " (" + actual + ")");
                log.Info("\tP:" + precision + " (" + matched + "/" + guessed + ")");
                log.Info("\tR:" + recall + " (" + matched + "/" + actual + ")");
                log.Info("\tF1:" + f1);
            }
        }