public virtual Neighbor[] distances(int[] unknown, int k, double distanceThreshold)
{
    int curTokenRuleIndex = unknown[Trainer.INDEX_EARLIEST_LEFT_ANCESTOR];
    int prevTokenRuleIndex = unknown[Trainer.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR];
    int pr = Trainer.unrulealt(prevTokenRuleIndex)[0];
    int cr = Trainer.unrulealt(curTokenRuleIndex)[0];

    MyHashSet<int> vectorIndexesMatchingContext = null;

    // Look for an exact context match and take the result even if it has < k entries;
    // exact matches always win.
    if (FEATURES == org.antlr.codebuff.Trainer.FEATURES_INJECT_WS)
    {
        corpus.wsFeaturesToExemplarIndexes.TryGetValue(new FeatureVectorAsObject(unknown, FEATURES), out vectorIndexesMatchingContext);
    }
    else if (FEATURES == org.antlr.codebuff.Trainer.FEATURES_HPOS)
    {
        corpus.hposFeaturesToExemplarIndexes.TryGetValue(new FeatureVectorAsObject(unknown, FEATURES), out vectorIndexesMatchingContext);
    }
    // else: might be a specialized feature set for testing, so ignore these caches in that case

    // Must have 4 or more dist=0.0 exemplars for WS, else we search wider;
    // we can't use this cache if we are testing out different feature sets.
    if (FEATURES == org.antlr.codebuff.Trainer.FEATURES_INJECT_WS && (vectorIndexesMatchingContext == null || vectorIndexesMatchingContext.Count <= 3))
    {
        // ok, not exact; look for a match on just the prev and current rule indexes
        org.antlr.codebuff.misc.Pair<int, int> key = new org.antlr.codebuff.misc.Pair<int, int>(pr, cr);
        corpus.curAndPrevTokenRuleIndexToExemplarIndexes.TryGetValue(key, out vectorIndexesMatchingContext);
    }
    if (FEATURES == org.antlr.codebuff.Trainer.FEATURES_HPOS && (vectorIndexesMatchingContext == null || vectorIndexesMatchingContext.Count < k))
    {
        // ok, not exact; look for a match on just the prev and current rule indexes
        org.antlr.codebuff.misc.Pair<int, int> key = new org.antlr.codebuff.misc.Pair<int, int>(pr, cr);
        corpus.curAndPrevTokenRuleIndexToExemplarIndexes.TryGetValue(key, out vectorIndexesMatchingContext);
    }
    if (distanceThreshold == org.antlr.codebuff.Trainer.MAX_CONTEXT_DIFF_THRESHOLD2)
    {
        // couldn't find anything; open it up to the full training set
        vectorIndexesMatchingContext = null;
    }

    IList<Neighbor> distances = new List<Neighbor>();
    if (vectorIndexesMatchingContext == null)
    {
        // no matching contexts for this feature vector; must scan the full training set
        int n = corpus.featureVectors.Count; // num training samples
        int num0 = 0; // how many 0-distance exemplars have we seen? Once we hit k we can stop
        for (int i = 0; i < n; i++)
        {
            int[] x = corpus.featureVectors[i];
            double d = distance(x, unknown);
            if (d <= distanceThreshold)
            {
                distances.Add(new Neighbor(corpus, d, i));
                if (d == 0.0)
                {
                    num0++;
                    if (num0 == k)
                    {
                        break;
                    }
                }
            }
        }
    }
    else
    {
        int num0 = 0; // how many 0-distance exemplars have we seen? Once we hit k we can stop
        foreach (int vectorIndex in vectorIndexesMatchingContext)
        {
            int[] x = corpus.featureVectors[vectorIndex];
            double d = distance(x, unknown);
            if (d <= distanceThreshold)
            {
                distances.Add(new Neighbor(corpus, d, vectorIndex));
                if (d == 0.0)
                {
                    num0++;
                    if (num0 == k)
                    {
                        break;
                    }
                }
            }
        }
    }
    return distances.ToArray();
}
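// ------------------------------------------------------------------
// A minimal usage sketch, not part of the original source: it shows how a
// caller might reduce the candidate list returned by distances() to the k
// nearest neighbors. The classifier parameter type (kNNClassifier) and the
// Neighbor.distance field are assumptions based on the surrounding code;
// requires System.Linq for OrderBy/Take.
private static Neighbor[] kNearest(kNNClassifier classifier, int[] unknown, int k, double distanceThreshold)
{
    // distances() returns every exemplar within the threshold (possibly far
    // more than k); sort by distance and keep only the k closest.
    Neighbor[] candidates = classifier.distances(unknown, k, distanceThreshold);
    return candidates.OrderBy(n => n.distance).Take(k).ToArray();
}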
public static float[] getMedianErrorRates(LangDescriptor language, int maxNumFiles, int trials)
{
    SubsetValidator validator = new SubsetValidator(language.corpusDir, language);
    IList<InputDocument> documents = Tool.load(validator.allFiles, language);
    float[] medians = new float[Math.Min(documents.Count, maxNumFiles) + 1];

    int ncpu = Runtime.Runtime.availableProcessors();
    if (FORCE_SINGLE_THREADED)
    {
        ncpu = 2; // pool below is sized ncpu-1, so this yields a single worker thread
    }
    ExecutorService pool = Executors.newFixedThreadPool(ncpu - 1);
    IList<Callable<Void>> jobs = new List<Callable<Void>>();
    for (int i = 1; i <= Math.Min(validator.allFiles.Count, maxNumFiles); i++)
    {
        // i is the corpus subset size; copy it so each closure captures its own value
        int corpusSubsetSize = i;
        Callable<Void> job = () =>
        {
            try
            {
                List<float?> errorRates = new List<float?>();
                for (int trial = 1; trial <= trials; trial++)
                {
                    // multiple trials per subset size
                    org.antlr.codebuff.misc.Pair<InputDocument, IList<InputDocument>> sample = validator.selectSample(documents, corpusSubsetSize);
                    Triple<Formatter, float?, float?> results = validate(language, sample.b, sample.a, true, false);
                    // System.out.println(sample.a.fileName+" n="+corpusSubsetSize+": error="+results.c);
                    // System.out.println("\tcorpus =\n\t\t"+Utils.join(sample.b.iterator(), "\n\t\t"));
                    errorRates.Add(results.c);
                }
                errorRates.Sort();
                int n = errorRates.Count;
                float median = errorRates[n / 2].Value;
                Console.WriteLine("median " + language.name + " error rate for n=" + corpusSubsetSize + " is " + median);
                medians[corpusSubsetSize] = median;
            }
            catch (Exception t)
            {
                t.printStackTrace(System.err);
            }
            return null;
        };
        jobs.Add(job);
    }
    pool.invokeAll(jobs);
    pool.shutdown();
    bool terminated = pool.awaitTermination(60, TimeUnit.MINUTES);
    return medians;
}
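// ------------------------------------------------------------------
// A minimal usage sketch, not part of the original source: a hypothetical
// driver that computes a learning curve for one language and prints
// "subset size, median error rate" pairs. The maxNumFiles/trials values are
// arbitrary assumptions; medians[0] is unused because subset sizes start at 1.
public static void printLearningCurve(LangDescriptor language)
{
    int maxNumFiles = 30; // assumed cap on corpus subset size
    int trials = 50;      // assumed number of trials per subset size
    float[] medians = getMedianErrorRates(language, maxNumFiles, trials);
    for (int n = 1; n < medians.Length; n++)
    {
        Console.WriteLine(n + ", " + medians[n]);
    }
}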