Exemplo n.º 1
0
        /// <summary>
        /// Normalize pair frequency according to the frequencies of words derived from
        /// a corpus and the branching factor of extracted relations.
        /// sim = (2*mean_i(b_i)/(b_i+b_j)) * P(r_ij)/(P(w_i)*P(w_j))
        /// where P(r_ij) = r_ij / sum_ij(r_ij), P(w_i) = f_i / sum_i(f_i),
        /// b_i is the number of extractions with frequency more than minFreq.
        /// A word that is absent from the corpus vocabulary is given the mean word
        /// probability instead. If the collection is already transformed, only a
        /// notice is printed and the scores are left untouched.
        /// </summary>
        /// <param name="corpusFreqFile">Word frequency file handed to the IVocabulary constructor.</param>
        /// <param name="minFreq">Frequency threshold forwarded to getRelationsNum.</param>
        public void rerankEfreqCfreqRnum(string corpusFreqFile, int minFreq)
        {
            if (isTransformed)
            {
                Console.WriteLine("These raw frequency scores of this relation collection are already normalized");
                return;
            }

            // Load the word frequencies
            var corpusFreq = new IVocabulary(corpusFreqFile, false);

            // Get number of extractions per term with frequency more than minFreq
            Dictionary<string, int> relationsNum = getRelationsNum(minFreq);

            // Calculate mean number of relations per word
            int[]  branchesArray   = relationsNum.Values.ToArray();
            double meanBranchesNum = (double)branchesArray.Sum() / branchesArray.Length;

            // Probabilities of words; meanProb is the fallback for out-of-vocabulary words
            double meanProb   = corpusFreq.getMeanFreq() / corpusFreq.getTokensNum();
            var    corpusProb = corpusFreq.getProb();

            // Probability of relations
            this.normProb();

            // Branching factor used when the target itself has no entry in relationsNum
            const double fallbackRnum = 1;

            // Reranking
            int missingWordsNum = 0;

            foreach (var target in this)
            {
                // Everything that depends only on the target is looked up once per target
                bool   targetKnown = corpusProb.ContainsKey(target.Key);
                double targetProb  = targetKnown ? corpusProb[target.Key] : meanProb;

                int  targetBranches;
                bool targetHasRnum = relationsNum.TryGetValue(target.Key, out targetBranches);

                foreach (var relatum in target.Value)
                {
                    bool   relatumKnown = corpusProb.ContainsKey(relatum.Key);
                    double relatumProb  = relatumKnown ? corpusProb[relatum.Key] : meanProb;

                    // Count the pairs for which at least one word had to use the fallback probability
                    if (!targetKnown || !relatumKnown)
                    {
                        missingWordsNum++;
                    }

                    double targetRnum;
                    double relatumRnum;
                    int    relatumBranches;

                    if (targetHasRnum && relationsNum.TryGetValue(relatum.Key, out relatumBranches))
                    {
                        targetRnum  = targetBranches;
                        relatumRnum = relatumBranches;
                    }
                    else if (targetHasRnum)
                    {
                        // Only the target is known: reuse its branching factor for the relatum
                        Console.WriteLine("Warning: Input file is not symmetric for word '{0}'. Reranking may be wrong.", relatum.Key);
                        targetRnum  = targetBranches;
                        relatumRnum = targetBranches;
                    }
                    else
                    {
                        Console.WriteLine("Error: can not compute relation frequency for the words '{0}' and {1}. Reranking may be wrong.",
                                          target.Key, relatum.Key);
                        targetRnum  = fallbackRnum;
                        relatumRnum = fallbackRnum;
                    }

                    // P(r_ij) / (P(w_i) * P(w_j)), scaled by the relative branching factor
                    relatum.Value.sim =
                        (2 * meanBranchesNum / (targetRnum + relatumRnum)) * (relatum.Value.sim / (targetProb * relatumProb));
                }
            }

            Console.WriteLine("missing words = {0}", missingWordsNum);

            isTransformed = true;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Normalize pair frequency according to the frequencies of words derived from
        /// a corpus (used for extraction) or an extraction concordance.
        /// type == 1: sim = r_ij/(f_i + f_j), where f_i is the number of
        /// times the word w_i appeared in the corpus/concordance.
        /// type == 2: sim = P(r_ij)/(P(w_i)P(w_j))
        /// type == 3: sim = -log(P(r_ij)/(P(w_i)P(w_j)))
        /// where P(r_ij) = r_ij / sum_ij(r_ij), P(w_i) = f_i / sum_i(f_i).
        /// A word that is absent from the vocabulary is given the mean frequency
        /// (or mean probability) instead. Any other type prints an error and leaves
        /// the scores untouched, as does calling this on an already transformed collection.
        /// </summary>
        /// <param name="corpusFreqFile">Word frequency file handed to the IVocabulary constructor.</param>
        /// <param name="type">Normalization variant: 1, 2 or 3 as described above.</param>
        public void rerankEfreqCfreq(string corpusFreqFile, int type)
        {
            if (isTransformed)
            {
                Console.WriteLine("These raw frequency scores of this relation collection are already normalized");
                return;
            }

            // Load the word frequencies
            var corpusFreq = new IVocabulary(corpusFreqFile, false);

            // Reranking
            double meanFreq = corpusFreq.getMeanFreq();

            Console.WriteLine("mean frequency={0}", meanFreq);

            // Number of pairs for which at least one word was not found in the vocabulary
            int missingWordsNum = 0;

            if (type == 1)
            {
                foreach (var target in this)
                {
                    bool   targetKnown = corpusFreq.ContainsKey(target.Key);
                    double targetFreq  = targetKnown ? corpusFreq[target.Key][IVocabulary.FREQ] : meanFreq;

                    foreach (var relatum in target.Value)
                    {
                        bool   relatumKnown = corpusFreq.ContainsKey(relatum.Key);
                        double relatumFreq  = relatumKnown ? corpusFreq[relatum.Key][IVocabulary.FREQ] : meanFreq;

                        if (!targetKnown || !relatumKnown)
                        {
                            missingWordsNum++;
                        }

                        // r_ij / (f_i + f_j)
                        relatum.Value.sim = relatum.Value.sim / (targetFreq + relatumFreq);
                    }
                }
            }
            else if (type == 2 || type == 3)
            {
                double meanProb   = meanFreq / corpusFreq.getTokensNum();
                var    corpusProb = corpusFreq.getProb();

                // Turn the raw relation frequencies into probabilities P(r_ij)
                this.normProb();

                foreach (var target in this)
                {
                    bool   targetKnown = corpusProb.ContainsKey(target.Key);
                    double targetProb  = targetKnown ? corpusProb[target.Key] : meanProb;

                    foreach (var relatum in target.Value)
                    {
                        bool   relatumKnown = corpusProb.ContainsKey(relatum.Key);
                        double relatumProb  = relatumKnown ? corpusProb[relatum.Key] : meanProb;

                        if (!targetKnown || !relatumKnown)
                        {
                            missingWordsNum++;
                        }

                        // P(r_ij) / (P(w_i) * P(w_j))
                        relatum.Value.sim = relatum.Value.sim / (targetProb * relatumProb);
                        if (type == 3)
                        {
                            relatum.Value.sim = -Math.Log(relatum.Value.sim);
                        }
                    }
                }
            }
            else
            {
                Console.WriteLine("Error: wrong type of normalization.");
                return;
            }

            Console.WriteLine("missing words = {0}", missingWordsNum);

            isTransformed = true;
        }