예제 #1
0
        // Collects the training exemplars whose distance to the feature vector `unknown` is at
        // most `distanceThreshold`.
        //
        // The set of exemplars that gets measured is narrowed in stages:
        //   1. If FEATURES is FEATURES_INJECT_WS or FEATURES_HPOS, the matching corpus cache is
        //      asked for the exemplars recorded under exactly this feature vector.
        //   2. If that yields nothing, or too few entries (3 or fewer for FEATURES_INJECT_WS,
        //      fewer than k for FEATURES_HPOS), the exemplars recorded under the (pr, cr) rule
        //      index pair are used instead.
        //   3. If there is still no candidate set -- or distanceThreshold equals
        //      MAX_CONTEXT_DIFF_THRESHOLD2 -- every vector in corpus.featureVectors is measured.
        //
        // Parameters:
        //   unknown            feature vector to find neighbors for
        //   k                  scanning stops once this many distance-0.0 exemplars were recorded;
        //                      also the minimum exact-match cache size accepted for FEATURES_HPOS
        //   distanceThreshold  largest distance an exemplar may have and still be returned
        //
        // Returns the accepted neighbors in the order they were visited; no sorting is done here.
        public virtual Neighbor[] distances(int[] unknown, int k, double distanceThreshold)
        {
            // NOTE(review): "cur" is read from INDEX_PREV_EARLIEST_RIGHT_ANCESTOR and "prev" from
            // INDEX_EARLIEST_LEFT_ANCESTOR; kept exactly as found -- confirm the naming against Trainer.
            int curTokenRuleIndex  = unknown[Trainer.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR];
            int prevTokenRuleIndex = unknown[Trainer.INDEX_EARLIEST_LEFT_ANCESTOR];
            int pr = Trainer.unrulealt(prevTokenRuleIndex)[0];
            int cr = Trainer.unrulealt(curTokenRuleIndex)[0];

            MyHashSet<int> vectorIndexesMatchingContext = null;

            // At MAX_CONTEXT_DIFF_THRESHOLD2 the search is opened up to the whole training set no
            // matter what the caches hold, so the lookups (and their key objects) are skipped.
            if (distanceThreshold != org.antlr.codebuff.Trainer.MAX_CONTEXT_DIFF_THRESHOLD2)
            {
                bool injectWs = FEATURES == org.antlr.codebuff.Trainer.FEATURES_INJECT_WS;
                bool hpos     = FEATURES == org.antlr.codebuff.Trainer.FEATURES_HPOS;

                // Look for exact matches first; TryGetValue leaves the out variable null on a miss.
                if (injectWs)
                {
                    corpus.wsFeaturesToExemplarIndexes.TryGetValue(
                        new FeatureVectorAsObject(unknown, FEATURES),
                        out vectorIndexesMatchingContext);
                }
                else if (hpos)
                {
                    corpus.hposFeaturesToExemplarIndexes.TryGetValue(
                        new FeatureVectorAsObject(unknown, FEATURES),
                        out vectorIndexesMatchingContext);
                }
                // else might be a specialized feature set for testing, so the caches are ignored

                // WS needs 4 or more exact matches, HPOS needs at least k; otherwise search wider.
                bool noExactMatches = vectorIndexesMatchingContext == null;
                bool tooFewForWs    = injectWs && (noExactMatches || vectorIndexesMatchingContext.Count <= 3);
                bool tooFewForHpos  = hpos && (noExactMatches || vectorIndexesMatchingContext.Count < k);

                if (tooFewForWs || tooFewForHpos)
                {
                    // not (enough) exact matches: fall back to exemplars sharing prev/current rule index
                    org.antlr.codebuff.misc.Pair<int, int> key = new org.antlr.codebuff.misc.Pair<int, int>(pr, cr);
                    corpus.curAndPrevTokenRuleIndexToExemplarIndexes.TryGetValue(key,
                                                                                 out vectorIndexesMatchingContext);
                }
            }

            List<Neighbor> neighbors = new List<Neighbor>();
            int            num0      = 0; // how many 0-distance elements have we seen? If k we can stop!

            if (vectorIndexesMatchingContext == null)
            {
                // no matching contexts for this feature, must rely on full training set
                int n = corpus.featureVectors.Count; // num training samples
                for (int i = 0; i < n; i++)
                {
                    if (addNeighborIfWithinThreshold(neighbors, unknown, i, distanceThreshold) && ++num0 == k)
                    {
                        break;
                    }
                }
            }
            else
            {
                foreach (int vectorIndex in vectorIndexesMatchingContext)
                {
                    if (addNeighborIfWithinThreshold(neighbors, unknown, vectorIndex, distanceThreshold) && ++num0 == k)
                    {
                        break;
                    }
                }
            }
            return(neighbors.ToArray());
        }

        // Measures training vector `vectorIndex` against `unknown` and appends a Neighbor to
        // `neighbors` when the distance does not exceed `distanceThreshold`.
        // Returns true only when a neighbor was appended and its distance is exactly 0.0.
        private bool addNeighborIfWithinThreshold(List<Neighbor> neighbors, int[] unknown, int vectorIndex, double distanceThreshold)
        {
            int[]  x = corpus.featureVectors[vectorIndex];
            double d = distance(x, unknown);
            if (d <= distanceThreshold)
            {
                neighbors.Add(new Neighbor(corpus, d, vectorIndex));
                return(d == 0.0);
            }
            return(false);
        }
예제 #2
0
        // Computes, for every corpus subset size from 1 up to min(number of documents, maxNumFiles),
        // the median of the value validate() returns in its third component (used here as the
        // error rate) over `trials` randomly selected samples.
        //
        // Parameters:
        //   language     descriptor supplying the corpus directory and the name used in log output
        //   maxNumFiles  upper limit on the corpus subset size that is evaluated
        //   trials       number of samples validated per subset size
        //
        // Returns an array indexed by subset size. Index 0 is never written and stays 0, as does
        // the slot of any subset size whose job failed (the failure is written to stderr).
        // For an even number of trials the upper of the two middle values is reported.
        public static float[] getMedianErrorRates(LangDescriptor language, int maxNumFiles, int trials)
        {
            SubsetValidator       validator = new SubsetValidator(language.corpusDir, language);
            IList <InputDocument> documents = Tool.load(validator.allFiles, language);

            // One bound drives both the array size and the job range, so a job can never index
            // past the end of medians even if fewer documents were loaded than files listed.
            int     maxSubsetSize = Math.Min(documents.Count, maxNumFiles);
            float[] medians       = new float[maxSubsetSize + 1];

            if (trials < 1)
            {
                // no samples means there is no median to compute; leave every slot at 0
                return(medians);
            }

            // Keep one core free for the rest of the system, but always run at least one worker:
            // a single-core machine would otherwise ask for zero workers.
            int ncpu    = FORCE_SINGLE_THREADED ? 2 : System.Environment.ProcessorCount;
            int workers = Math.Max(1, ncpu - 1);

            System.Threading.Tasks.ParallelOptions options = new System.Threading.Tasks.ParallelOptions();
            options.MaxDegreeOfParallelism = workers;

            // Parallel.For returns only after every subset size has been processed.
            System.Threading.Tasks.Parallel.For(1, maxSubsetSize + 1, options, corpusSubsetSize =>
            {
                try
                {
                    List <float?> errorRates = new List <float?>(trials);
                    for (int trial = 1; trial <= trials; trial++)
                    {             // multiple trials per subset size
                        org.antlr.codebuff.misc.Pair <InputDocument, IList <InputDocument> > sample = validator.selectSample(documents, corpusSubsetSize);
                        Triple <Formatter, float?, float?> results = validate(language, sample.b, sample.a, true, false);
                        errorRates.Add(results.c);
                    }
                    errorRates.Sort();
                    int   n      = errorRates.Count;
                    float median = errorRates[n / 2].Value;
                    Console.WriteLine("median " + language.name + " error rate for n=" + corpusSubsetSize + " is " + median);
                    medians[corpusSubsetSize] = median; // each job writes only its own slot
                }
                catch (Exception t)
                {
                    // report and carry on so one failing subset size does not abort the others
                    Console.Error.WriteLine(t);
                }
            });

            return(medians);
        }