/// <summary>
/// Builds a multimap from each context's feature vector to the values stored
/// for that context in <c>corpus.hpos</c>.
/// </summary>
/// <param name="corpus">Corpus whose <c>featureVectors</c> and <c>hpos</c> lists are read in parallel, index by index.</param>
/// <returns>
/// A new multimap with one entry added per index of <c>corpus.featureVectors</c>.
/// Keys are built with <c>Trainer.FEATURES_HPOS</c>; presumably that selects which
/// features take part in key equality (confirm against FeatureVectorAsObject).
/// </returns>
public static ArrayListMultiMap<FeatureVectorAsObject, int> getHPosContextCategoryMap(Corpus corpus)
{
    var categoriesByContext = ArrayListMultiMap<FeatureVectorAsObject, int>.create();
    int total = corpus.featureVectors.Count;
    int ctx = 0;
    while (ctx < total)
    {
        int[] features = corpus.featureVectors[ctx];
        int category = corpus.hpos[ctx];
        var key = new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS);
        categoriesByContext.Add(key, category);
        ctx++;
    }
    return categoriesByContext;
}
/// <summary>
/// Builds a multimap from each context's feature vector to the values stored
/// for that context in <c>corpus.injectWhitespace</c>.
/// </summary>
/// <param name="corpus">Corpus whose <c>featureVectors</c> and <c>injectWhitespace</c> lists are read in parallel, index by index.</param>
/// <returns>
/// A new multimap with one entry added per index of <c>corpus.featureVectors</c>.
/// Keys are built with <c>Trainer.FEATURES_INJECT_WS</c>; presumably that selects which
/// features take part in key equality (confirm against FeatureVectorAsObject).
/// </returns>
public static ArrayListMultiMap<FeatureVectorAsObject, int> getWSContextCategoryMap(Corpus corpus)
{
    var categoriesByContext = ArrayListMultiMap<FeatureVectorAsObject, int>.create();
    int total = corpus.featureVectors.Count;
    int ctx = 0;
    while (ctx < total)
    {
        int[] features = corpus.featureVectors[ctx];
        int category = corpus.injectWhitespace[ctx];
        var key = new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS);
        categoriesByContext.Add(key, category);
        ctx++;
    }
    return categoriesByContext;
}
/// <summary>
/// Trains a corpus on a single file and another corpus on every other loaded
/// document, then prints the whitespace category ratios for each context of the
/// single-file corpus whose ratios differ between the two.
/// </summary>
/// <remarks>
/// The loop ends with an unconditional <c>break</c>, so only the first entry of
/// the file list is examined. This is kept as in the original.
/// </remarks>
/// <param name="language">Descriptor supplying <c>corpusDir</c> and <c>fileRegex</c> used to locate and load the documents.</param>
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    foreach (string fileName in filenames)
    {
        // Examine info for this file in isolation
        Corpus fileCorpus = new Corpus(fileName, language);
        fileCorpus.train();
        Console.WriteLine(fileName);
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> ws = getWSContextCategoryMap(fileCorpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

        // Compare with corpus minus this file
        string path = fileName;
        IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
        Corpus corpus = new Corpus(others, language);
        corpus.train();
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_ws = getWSContextCategoryMap(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

        foreach (FeatureVectorAsObject x in ws.Keys)
        {
            HashBag<int> fwsCats = getCategoriesBag(ws[x]);
            IList<float> fwsRatios = getCategoryRatios(fwsCats.Values);
            // NOTE(review): x comes from the single-file map; what the indexer yields
            // when the other corpus never saw this context depends on ArrayListMultiMap.
            HashBag<int> wsCats = getCategoriesBag(corpus_ws[x]);
            IList<float> wsRatios = getCategoryRatios(wsCats.Values);
            // compare file predictions with corpus predictions
            if (!fwsRatios.SequenceEqual(wsRatios))
            {
                // Concatenating an IList directly would print its type name, not its
                // elements, so the ratios are joined explicitly.
                Console.WriteLine("[" + string.Join(", ", fwsRatios) + "] vs [" + string.Join(", ", wsRatios) + "]");
            }

            // The hpos bags are computed but not compared yet; the lookups are kept
            // so the same indexer calls are made as before.
            HashBag<int> fhposCats = getCategoriesBag(hpos[x]);
            HashBag<int> hposCats = getCategoriesBag(corpus_hpos[x]);
        }
        break;
    }
}
/// <summary>
/// Groups the corpus's whitespace and hpos values by feature-vector context,
/// prints one line per whitespace context with its category bags and normalized
/// entropies, then prints the mean entropies and a context-richness figure.
/// </summary>
/// <param name="corpus">Corpus whose <c>featureVectors</c>, <c>injectWhitespace</c> and <c>hpos</c> lists are read in parallel.</param>
public static void examineCorpus(Corpus corpus)
{
    var wsGroups = ArrayListMultiMap<FeatureVectorAsObject, int>.create();
    var hposGroups = ArrayListMultiMap<FeatureVectorAsObject, int>.create();

    int numContexts = corpus.featureVectors.Count;
    for (int ctx = 0; ctx < numContexts; ctx++)
    {
        int[] features = corpus.featureVectors[ctx];
        int wsCategory = corpus.injectWhitespace[ctx];
        int hposCategory = corpus.hpos[ctx];
        wsGroups.Add(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), wsCategory);
        hposGroups.Add(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), hposCategory);
    }

    IList<double> wsEntropies = new List<double>();
    IList<double> hposEntropies = new List<double>();
    // Iteration is driven by the whitespace map's keys; the hpos map is looked up
    // with those same key objects.
    foreach (FeatureVectorAsObject context in wsGroups.Keys)
    {
        var wsValues = wsGroups[context];
        var hposValues = hposGroups[context];
        HashBag<int> wsBag = getCategoriesBag(wsValues);
        HashBag<int> hposBag = getCategoriesBag(hposValues);

        double wsEntropy = getNormalizedCategoryEntropy(getCategoryRatios(wsBag.Values));
        double hposEntropy = getNormalizedCategoryEntropy(getCategoryRatios(hposBag.Values));
        wsEntropies.Add(wsEntropy);
        hposEntropies.Add(hposEntropy);

        Console.Write("{0,130} : {1},{2} {3},{4}\n", context, wsBag, wsEntropy, hposBag, hposEntropy);
    }

    Console.WriteLine("MEAN " + BuffUtils.mean(wsEntropies));
    Console.WriteLine("MEAN " + BuffUtils.mean(hposEntropies));

    // 0..1 where 1 means every token had different context
    int uniqueContexts = wsEntropies.Count;
    float contextRichness = uniqueContexts / (float)numContexts;
    Console.WriteLine("Context richness = " + contextRichness + " uniq ctxs=" + uniqueContexts + ", nctxs=" + numContexts);
}