/// <summary>
/// Records one occurrence of <paramref name="term"/> at three levels:
/// globally, per document type, and per document within that type.
/// </summary>
/// <param name="docType">Document type index used as the first key of the per-type and per-document tables.</param>
/// <param name="doc">Document key used inside the per-document table.</param>
/// <param name="term">The term whose frequency counters are incremented.</param>
protected void AddTerm(int docType, string doc, string term)
{
    // Step 1: make sure an entry for the term exists at every level.
    // NOTE(review): the per-type and per-document buckets are indexed before
    // Add is called, so they are presumably created elsewhere — confirm.
    var seenGlobally = TermFWByGlobal.ContainsKey(term);
    if (!seenGlobally)
    {
        TermFWByGlobal.Add(term, new TermFW(term));
    }

    var seenForType = TermFWByType.ContainsKey(docType, term);
    if (!seenForType)
    {
        TermFWByType[docType].Add(term, new TermFW(term));
    }

    var seenForDoc = TermFWByDoc.ContainsKey(docType, doc, term);
    if (!seenForDoc)
    {
        TermFWByDoc[docType][doc].Add(term, new TermFW(term));
    }

    // Step 2: bump the frequency counter at every level, in the same order.
    var globalEntry = TermFWByGlobal[term];
    globalEntry.Freq += 1;

    var typeEntry = TermFWByType[docType][term];
    typeEntry.Freq += 1;

    var docEntry = TermFWByDoc[docType][doc][term];
    docEntry.Freq += 1;
}
/// <summary>
/// Counts, over the union of the terms of two documents, how many
/// (term, document type) pairs exist for terms whose global frequency
/// is greater than one.
/// </summary>
/// <param name="doc1">Document key looked up under type index 0.</param>
/// <param name="doc2">Document key looked up under type index 1.</param>
/// <returns>
/// The sum, over qualifying terms, of 1 for each of type 0 and type 1
/// in which the term is present in the per-type table.
/// </returns>
protected int GetDocTermsTrueCount(string doc1, string doc2)
{
    var firstTerms = TermsByDoc[0, doc1];
    var secondTerms = TermsByDoc[1, doc2];

    var total = 0;
    foreach (var candidate in firstTerms.Union(secondTerms))
    {
        // Only terms seen more than once overall contribute to the count.
        var seenMoreThanOnce = TermFWByGlobal[candidate].Freq > 1;
        if (!seenMoreThanOnce)
        {
            continue;
        }

        // One point for each document type that knows the term.
        total += TermFWByType.ContainsKey(0, candidate) ? 1 : 0;
        total += TermFWByType.ContainsKey(1, candidate) ? 1 : 0;
    }

    return total;
}
/// <summary>
/// Computes the similarity between two documents as the cosine of their
/// term vectors (frequency multiplied by weight for each term).
/// </summary>
/// <param name="doc1">Document key looked up under type index 0.</param>
/// <param name="doc2">Document key looked up under type index 1.</param>
/// <returns>
/// The dot product divided by the square root of the product of the two
/// squared norms, or -1 when that product is not greater than zero.
/// </returns>
private double computerSimilar(string doc1, string doc2)
{
    var firstTerms = TermsByDoc[0, doc1];
    var secondTerms = TermsByDoc[1, doc2];
    var vocabulary = firstTerms.Union(secondTerms).ToArray();

    double dot = 0;
    double normSq1 = 0;
    double normSq2 = 0;

    foreach (var word in vocabulary)
    {
        double component1 = 0;
        double component2 = 0;

        // A term contributes only when it occurs more than once globally and
        // is present in the per-type table for both type 0 and type 1.
        if (TermFWByGlobal[word].Freq > 1
            && TermFWByType.ContainsKey(0, word)
            && TermFWByType.ContainsKey(1, word))
        {
            var freq1 = GetTermFreq(doc1, doc2, 0, doc1, word);
            var weight1 = GetTermWeight(doc1, doc2, 0, doc1, word);
            component1 = freq1 * weight1;

            var freq2 = GetTermFreq(doc1, doc2, 1, doc2, word);
            var weight2 = GetTermWeight(doc1, doc2, 1, doc2, word);
            component2 = freq2 * weight2;
        }

        dot += component1 * component2;
        normSq1 += component1 * component1;
        normSq2 += component2 * component2;
    }

    // -1 signals that the similarity is undefined (at least one zero vector).
    return normSq1 * normSq2 > 0
        ? dot / Math.Sqrt(normSq1 * normSq2)
        : -1;
}