/// <summary>
/// Builds a multimap that groups the inject-whitespace category of every
/// context in <paramref name="corpus"/> under a key made from that context's
/// feature vector and <c>Trainer.FEATURES_INJECT_WS</c>.
/// </summary>
/// <param name="corpus">
/// Corpus supplying the parallel <c>featureVectors</c> and
/// <c>injectWhitespace</c> sequences; entry i of each describes context i.
/// </param>
/// <returns>
/// A newly created multimap holding one (key, category) entry per context,
/// added in corpus order.
/// </returns>
public static ArrayListMultiMap <FeatureVectorAsObject, int> getWSContextCategoryMap(Corpus corpus)
{
    ArrayListMultiMap <FeatureVectorAsObject, int> groupedWs =
        ArrayListMultiMap <FeatureVectorAsObject, int> .create();

    // The context count is taken from the feature-vector list, once, up front.
    int total = corpus.featureVectors.Count;
    for (int idx = 0; idx < total; idx++)
    {
        int[] features = corpus.featureVectors[idx];
        int wsCategory = corpus.injectWhitespace[idx];

        FeatureVectorAsObject key = new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS);
        groupedWs.Add(key, wsCategory);
    }

    return groupedWs;
}
/// <summary>
/// Writes a diagnostic report about <paramref name="corpus"/> to the console.
/// </summary>
/// <remarks>
/// Every context's inject-whitespace and hpos category is first grouped under
/// a <c>FeatureVectorAsObject</c> key (built with
/// <c>Trainer.FEATURES_INJECT_WS</c> and <c>Trainer.FEATURES_HPOS</c>
/// respectively). Then, for each key of the whitespace grouping, one line is
/// printed with both category bags and their normalized entropies, followed
/// by the mean of each entropy list and a "context richness" ratio.
/// </remarks>
/// <param name="corpus">
/// Corpus supplying the parallel <c>featureVectors</c>,
/// <c>injectWhitespace</c> and <c>hpos</c> sequences.
/// </param>
public static void examineCorpus(Corpus corpus)
{
    ArrayListMultiMap <FeatureVectorAsObject, int> wsGroups =
        ArrayListMultiMap <FeatureVectorAsObject, int> .create();
    ArrayListMultiMap <FeatureVectorAsObject, int> hposGroups =
        ArrayListMultiMap <FeatureVectorAsObject, int> .create();

    // Pass 1: file each context's two categories under its feature-vector keys.
    int total = corpus.featureVectors.Count;
    for (int idx = 0; idx < total; idx++)
    {
        int[] features = corpus.featureVectors[idx];
        int wsCategory = corpus.injectWhitespace[idx];
        int hposCategory = corpus.hpos[idx];

        wsGroups.Add(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), wsCategory);
        hposGroups.Add(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), hposCategory);
    }

    // Pass 2: one report line per distinct whitespace key.
    IList <double> wsEntropyList = new List <double>();
    IList <double> hposEntropyList = new List <double>();
    foreach (FeatureVectorAsObject key in wsGroups.Keys)
    {
        var wsLabels = wsGroups[key];
        // NOTE(review): the hpos grouping is indexed with a key that was built
        // using FEATURES_INJECT_WS; whether this finds the intended hpos group
        // depends on FeatureVectorAsObject equality/hashing - confirm.
        var hposLabels = hposGroups[key];

        HashBag <int> wsBag = getCategoriesBag(wsLabels);
        HashBag <int> hposBag = getCategoriesBag(hposLabels);

        var wsRatios = getCategoryRatios(wsBag.Values);
        double wsEntropyValue = getNormalizedCategoryEntropy(wsRatios);
        var hposRatios = getCategoryRatios(hposBag.Values);
        double hposEntropyValue = getNormalizedCategoryEntropy(hposRatios);

        wsEntropyList.Add(wsEntropyValue);
        hposEntropyList.Add(hposEntropyValue);

        Console.Write("{0,130} : {1},{2} {3},{4}\n", key, wsBag, wsEntropyValue, hposBag, hposEntropyValue);
    }

    Console.WriteLine("MEAN " + BuffUtils.mean(wsEntropyList));
    Console.WriteLine("MEAN " + BuffUtils.mean(hposEntropyList));

    // One entropy was recorded per distinct whitespace key, so the list length
    // is the number of unique contexts. The ratio is 1 when every context
    // produced its own key.
    int uniqueContexts = wsEntropyList.Count;
    float contextRichness = uniqueContexts / (float)total;
    Console.WriteLine("Context richness = " + contextRichness + " uniq ctxs=" + uniqueContexts + ", nctxs=" + total);
}
/// <summary>
/// Builds a multimap that groups the hpos category of every context in
/// <paramref name="corpus"/> under a key made from that context's feature
/// vector and <c>Trainer.FEATURES_HPOS</c>.
/// </summary>
/// <param name="corpus">
/// Corpus supplying the parallel <c>featureVectors</c> and <c>hpos</c>
/// sequences; entry i of each describes context i.
/// </param>
/// <returns>
/// A newly created multimap holding one (key, category) entry per context,
/// added in corpus order.
/// </returns>
public static ArrayListMultiMap <FeatureVectorAsObject, int> getHPosContextCategoryMap(Corpus corpus)
{
    ArrayListMultiMap <FeatureVectorAsObject, int> groupedHpos =
        ArrayListMultiMap <FeatureVectorAsObject, int> .create();

    // The context count is taken from the feature-vector list, once, up front.
    int total = corpus.featureVectors.Count;
    for (int idx = 0; idx < total; idx++)
    {
        int[] features = corpus.featureVectors[idx];
        int hposCategory = corpus.hpos[idx];

        FeatureVectorAsObject key = new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS);
        groupedHpos.Add(key, hposCategory);
    }

    return groupedHpos;
}