/// <summary>
/// Normalizes the pair frequencies of this relation collection using word
/// frequencies loaded from a corpus frequency file and the per-word number of
/// relations (branching factor) returned by getRelationsNum(minFreq).
/// For every pair (w_i, w_j) the score becomes
///   sim = (2 * mean(b) / (b_i + b_j)) * sim_ij / (P(w_i) + P(w_j))
/// where sim_ij is the score left by normProb(), P(w) is the word probability
/// taken from the corpus vocabulary (the mean probability is used for words
/// absent from it), b_i is the relation count of w_i and mean(b) is the
/// average relation count over all counted words.
/// NOTE(review): earlier documentation of this method described the
/// denominator as the product P(w_i) * P(w_j); the implementation has always
/// used the sum. The arithmetic is intentionally left untouched here --
/// confirm which form is intended.
/// </summary>
/// <param name="corpusFreqFile">Path of the word frequency file passed to IVocabulary.</param>
/// <param name="minFreq">Frequency threshold forwarded to getRelationsNum.</param>
public void rerankEfreqCfreqRnum(string corpusFreqFile, int minFreq)
{
    if (isTransformed)
    {
        Console.WriteLine("These raw frequency scores of this relation collection are already normalized");
        return;
    }

    // Load the word frequencies.
    var corpusFreq = new IVocabulary(corpusFreqFile, false);

    // Number of extractions per term, as computed by getRelationsNum for the given threshold.
    Dictionary<string, int> relationsNum = getRelationsNum(minFreq);

    // Mean number of relations per word. With an empty dictionary this is 0.0 / 0 = NaN,
    // exactly as before.
    double meanBranchesNum = (double)relationsNum.Values.Sum() / relationsNum.Count;

    // Probabilities of words; meanProb is the fallback for words missing from the corpus.
    double meanProb = corpusFreq.getMeanFreq() / corpusFreq.getTokensNum();
    var corpusProb = corpusFreq.getProb();

    // Probability of relations.
    this.normProb();

    // Relation count used for both words when the target has no entry in relationsNum.
    const double fallbackRnum = 1;

    // Number of pairs in which at least one word is absent from the corpus vocabulary.
    int missingWordsNum = 0;

    foreach (var target in this)
    {
        // Everything that depends only on the target is looked up once per target.
        bool targetInCorpus = corpusProb.ContainsKey(target.Key);
        double targetProb = targetInCorpus ? corpusProb[target.Key] : meanProb;

        int targetBranches;
        bool targetCounted = relationsNum.TryGetValue(target.Key, out targetBranches);

        foreach (var relatum in target.Value)
        {
            bool relatumInCorpus = corpusProb.ContainsKey(relatum.Key);
            double relatumProb = relatumInCorpus ? corpusProb[relatum.Key] : meanProb;

            // The original condition was inverted and counted pairs whose words were present.
            if (!targetInCorpus || !relatumInCorpus)
            {
                missingWordsNum++;
            }

            double targetRnum;
            double relatumRnum;
            int relatumBranches;

            if (!targetCounted)
            {
                Console.WriteLine("Error: can not compute relation frequency for the words '{0}' and {1}. Reranking may be wrong.", target.Key, relatum.Key);
                targetRnum = fallbackRnum;
                relatumRnum = fallbackRnum;
            }
            else if (relationsNum.TryGetValue(relatum.Key, out relatumBranches))
            {
                targetRnum = targetBranches;
                relatumRnum = relatumBranches;
            }
            else
            {
                // Only the target has a relation count: reuse it for the relatum.
                Console.WriteLine("Warning: Input file is not symmetric for word '{0}'. Reranking may be wrong.", relatum.Key);
                targetRnum = targetBranches;
                relatumRnum = targetRnum;
            }

            relatum.Value.sim = (2 * meanBranchesNum / (targetRnum + relatumRnum)) * (relatum.Value.sim / (targetProb + relatumProb));
        }
    }

    // Reported in the same format as rerankEfreqCfreq.
    Console.WriteLine("missing words = {0}", missingWordsNum);
    isTransformed = true;
}
/// <summary>
/// Normalizes the pair frequencies of this relation collection using word
/// frequencies loaded from a frequency file (a corpus or an extraction
/// concordance). Words absent from the vocabulary fall back to the mean
/// frequency (type 1) or the mean probability (types 2 and 3).
///   type == 1: sim = r_ij / (f_i + f_j), where f_i is the frequency stored
///              for the word w_i in the vocabulary.
///   type == 2: sim = sim_ij / (P(w_i) + P(w_j))
///   type == 3: sim = -log(sim_ij / (P(w_i) + P(w_j)))
/// where sim_ij is the score left by normProb() and P(w) comes from
/// IVocabulary.getProb().
/// Any other type prints an error and leaves the collection unmarked.
/// NOTE(review): earlier documentation described types 2 and 3 with the
/// product P(w_i) * P(w_j) in the denominator; the implementation has always
/// used the sum. The arithmetic is intentionally left untouched here --
/// confirm which form is intended.
/// </summary>
/// <param name="corpusFreqFile">Path of the word frequency file passed to IVocabulary.</param>
/// <param name="type">Normalization variant: 1, 2 or 3 (see summary).</param>
public void rerankEfreqCfreq(string corpusFreqFile, int type)
{
    if (isTransformed)
    {
        Console.WriteLine("These raw frequency scores of this relation collection are already normalized");
        return;
    }

    // Load the word frequencies.
    var corpusFreq = new IVocabulary(corpusFreqFile, false);

    double meanFreq = corpusFreq.getMeanFreq();
    Console.WriteLine("mean frequency={0}", meanFreq);

    // Number of pairs in which at least one word is absent from the vocabulary.
    int missingWordsNum = 0;

    if (type == 1)
    {
        foreach (var target in this)
        {
            // The target lookup does not depend on the relatum, so do it once per target.
            bool targetKnown = corpusFreq.ContainsKey(target.Key);
            double targetFreq = targetKnown ? corpusFreq[target.Key][IVocabulary.FREQ] : meanFreq;

            foreach (var relatum in target.Value)
            {
                bool relatumKnown = corpusFreq.ContainsKey(relatum.Key);
                double relatumFreq = relatumKnown ? corpusFreq[relatum.Key][IVocabulary.FREQ] : meanFreq;

                // The original condition was inverted and counted pairs whose words were present.
                if (!targetKnown || !relatumKnown)
                {
                    missingWordsNum++;
                }

                relatum.Value.sim = relatum.Value.sim / (targetFreq + relatumFreq);
            }
        }
    }
    else if (type == 2 || type == 3)
    {
        double meanProb = meanFreq / corpusFreq.getTokensNum();
        var corpusProb = corpusFreq.getProb();

        // Probability of relations.
        this.normProb();

        foreach (var target in this)
        {
            bool targetKnown = corpusProb.ContainsKey(target.Key);
            double targetProb = targetKnown ? corpusProb[target.Key] : meanProb;

            foreach (var relatum in target.Value)
            {
                bool relatumKnown = corpusProb.ContainsKey(relatum.Key);
                double relatumProb = relatumKnown ? corpusProb[relatum.Key] : meanProb;

                // Same inverted-condition fix as in the type 1 branch.
                if (!targetKnown || !relatumKnown)
                {
                    missingWordsNum++;
                }

                relatum.Value.sim = relatum.Value.sim / (targetProb + relatumProb);
                if (type == 3)
                {
                    relatum.Value.sim = -Math.Log(relatum.Value.sim);
                }
            }
        }
    }
    else
    {
        Console.WriteLine("Error: wrong type of normalization.");
        return;
    }

    Console.WriteLine("missing words = {0}", missingWordsNum);
    isTransformed = true;
}