Exemple #1
0
        /// <summary>
        /// Groups the corpus' hpos categories by feature-vector context.
        /// </summary>
        /// <param name="corpus">Corpus whose <c>featureVectors</c> and <c>hpos</c> lists are read in parallel, index by index.</param>
        /// <returns>
        /// A multimap with one entry per context: the key wraps the feature vector together with
        /// <c>Trainer.FEATURES_HPOS</c>, the value is the matching <c>corpus.hpos</c> element.
        /// </returns>
        public static ArrayListMultiMap <FeatureVectorAsObject, int> getHPosContextCategoryMap(Corpus corpus)
        {
            ArrayListMultiMap <FeatureVectorAsObject, int> groups = ArrayListMultiMap <FeatureVectorAsObject, int> .create();

            // The number of contexts is sampled once, before any entry is added.
            int total = corpus.featureVectors.Count;

            for (int k = 0; k < total; ++k)
            {
                int[] features = corpus.featureVectors[k];
                int   category = corpus.hpos[k];
                FeatureVectorAsObject key = new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS);
                groups.Add(key, category);
            }

            return groups;
        }
Exemple #2
0
        /// <summary>
        /// Groups the corpus' inject-whitespace categories by feature-vector context.
        /// </summary>
        /// <param name="corpus">Corpus whose <c>featureVectors</c> and <c>injectWhitespace</c> lists are read in parallel, index by index.</param>
        /// <returns>
        /// A multimap with one entry per context: the key wraps the feature vector together with
        /// <c>Trainer.FEATURES_INJECT_WS</c>, the value is the matching <c>corpus.injectWhitespace</c> element.
        /// </returns>
        public static ArrayListMultiMap <FeatureVectorAsObject, int> getWSContextCategoryMap(Corpus corpus)
        {
            ArrayListMultiMap <FeatureVectorAsObject, int> groups = ArrayListMultiMap <FeatureVectorAsObject, int> .create();

            // The number of contexts is sampled once, before any entry is added.
            int total = corpus.featureVectors.Count;

            for (int k = 0; k < total; ++k)
            {
                int[] features = corpus.featureVectors[k];
                int   category = corpus.injectWhitespace[k];
                FeatureVectorAsObject key = new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS);
                groups.Add(key, category);
            }

            return groups;
        }
Exemple #3
0
        /// <summary>
        /// Compares, per whitespace context, the category ratios obtained from a single file trained
        /// in isolation against those obtained from the rest of the corpus (all loaded documents
        /// except that file), and prints every context whose ratios differ.
        /// </summary>
        /// <param name="language">
        /// Language descriptor; its <c>corpusDir</c> and <c>fileRegex</c> select the files to load.
        /// </param>
        /// <remarks>
        /// Only the first file returned by <c>Tool.getFilenames</c> is examined: the outer loop ends
        /// with an unconditional <c>break</c>.
        /// </remarks>
        public static void runCaptureForOneLanguage(LangDescriptor language)
        {
            IList <string>        filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents = Tool.load(filenames, language);

            foreach (string fileName in filenames)
            {
                // Examine info for this file in isolation
                Corpus fileCorpus = new Corpus(fileName, language);
                fileCorpus.train();
                Console.WriteLine(fileName);
                ArrayListMultiMap <FeatureVectorAsObject, int> ws   = getWSContextCategoryMap(fileCorpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

                // Compare with corpus minus this file.
                // The loop variable is copied into a local before being captured by the lambda.
                string path = fileName;
                IList <InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
                Corpus corpus = new Corpus(others, language);
                corpus.train();
                ArrayListMultiMap <FeatureVectorAsObject, int> corpus_ws   = getWSContextCategoryMap(corpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

                foreach (FeatureVectorAsObject x in ws.Keys)
                {
                    HashBag <int> fwsCats   = getCategoriesBag(ws[x]);
                    IList <float> fwsRatios = getCategoryRatios(fwsCats.Values);
                    HashBag <int> wsCats    = getCategoriesBag(corpus_ws[x]);
                    IList <float> wsRatios  = getCategoryRatios(wsCats.Values);
                    // compare file predictions with corpus predictions
                    if (!fwsRatios.SequenceEqual(wsRatios))
                    {
                        // Concatenating an IList<float> with a string only prints the list's type
                        // name, so the elements are joined explicitly to show the actual ratios.
                        Console.WriteLine("[" + string.Join(", ", fwsRatios) + "] vs [" + string.Join(", ", wsRatios) + "]");
                    }

                    // NOTE(review): the hpos bags are computed but never compared or printed.
                    // The lookups are kept so that any effect they have (including a failure for
                    // a key missing from the hpos maps) stays exactly as before. TODO confirm
                    // whether an hpos comparison was intended here.
                    HashBag <int> fhposCats = getCategoriesBag(hpos[x]);
                    HashBag <int> hposCats  = getCategoriesBag(corpus_hpos[x]);
                }

                // Stop after the first file (existing behaviour, kept as-is).
                break;
            }
        }
Exemple #4
0
        /// <summary>
        /// Prints a per-context report for a corpus: for every inject-whitespace context, the
        /// whitespace and hpos category bags with their normalized category entropies, followed by
        /// the mean of each entropy list and the ratio of unique contexts to total contexts.
        /// </summary>
        /// <param name="corpus">
        /// Corpus whose <c>featureVectors</c>, <c>injectWhitespace</c> and <c>hpos</c> lists are read
        /// in parallel, index by index.
        /// </param>
        public static void examineCorpus(Corpus corpus)
        {
            ArrayListMultiMap <FeatureVectorAsObject, int> wsGroups   = ArrayListMultiMap <FeatureVectorAsObject, int> .create();
            ArrayListMultiMap <FeatureVectorAsObject, int> hposGroups = ArrayListMultiMap <FeatureVectorAsObject, int> .create();

            // Single pass over the corpus: the same feature vector feeds both groupings,
            // each wrapped with its own feature selection.
            int total = corpus.featureVectors.Count;

            for (int k = 0; k < total; ++k)
            {
                int[] features     = corpus.featureVectors[k];
                int   wsCategory   = corpus.injectWhitespace[k];
                int   hposCategory = corpus.hpos[k];
                wsGroups.Add(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), wsCategory);
                hposGroups.Add(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), hposCategory);
            }

            IList <double> wsEntropyList   = new List <double>();
            IList <double> hposEntropyList = new List <double>();

            // Iteration is driven by the whitespace grouping's keys; the hpos grouping is
            // looked up with those same keys.
            foreach (FeatureVectorAsObject context in wsGroups.Keys)
            {
                var wsValues   = wsGroups[context];
                var hposValues = hposGroups[context];

                HashBag <int> wsBag   = getCategoriesBag(wsValues);
                HashBag <int> hposBag = getCategoriesBag(hposValues);

                double wsH   = getNormalizedCategoryEntropy(getCategoryRatios(wsBag.Values));
                double hposH = getNormalizedCategoryEntropy(getCategoryRatios(hposBag.Values));

                wsEntropyList.Add(wsH);
                hposEntropyList.Add(hposH);

                Console.Write("{0,130} : {1},{2} {3},{4}\n", context, wsBag, wsH, hposBag, hposH);
            }

            Console.WriteLine("MEAN " + BuffUtils.mean(wsEntropyList));
            Console.WriteLine("MEAN " + BuffUtils.mean(hposEntropyList));

            // One entropy entry was recorded per unique whitespace context, so the list length
            // is the unique-context count. 0..1 where 1 means every token had different context.
            int   uniqueContexts = wsEntropyList.Count;
            float richness       = (float)uniqueContexts / total;

            Console.WriteLine("Context richness = " + richness + " uniq ctxs=" + uniqueContexts + ", nctxs=" + total);
        }