Пример #1
0
        /// <summary>
        /// Counts one occurrence of <paramref name="term"/> in the global, per-type and
        /// per-document frequency tables, adding a new <c>TermFW</c> entry to any table
        /// that does not contain the term yet.
        /// </summary>
        /// <param name="docType">Document-type key used with the per-type and per-document tables.</param>
        /// <param name="doc">Document key used with the per-document table.</param>
        /// <param name="term">The term whose frequency is incremented.</param>
        protected void AddTerm(int docType, string doc, string term)
        {
            // Phase 1: make sure an entry exists at every level before any counter changes.
            bool knownGlobally = TermFWByGlobal.ContainsKey(term);
            if (!knownGlobally)
            {
                TermFWByGlobal.Add(term, new TermFW(term));
            }

            bool knownForType = TermFWByType.ContainsKey(docType, term);
            if (!knownForType)
            {
                TermFWByType[docType].Add(term, new TermFW(term));
            }

            bool knownForDoc = TermFWByDoc.ContainsKey(docType, doc, term);
            if (!knownForDoc)
            {
                TermFWByDoc[docType][doc].Add(term, new TermFW(term));
            }

            // Phase 2: bump the counter at each level (global, then type, then document).
            TermFWByGlobal[term].Freq++;
            TermFWByType[docType][term].Freq++;
            TermFWByDoc[docType][doc][term].Freq++;
        }
Пример #2
0
        /// <summary>
        /// Walks the union of the terms of <paramref name="doc1"/> (looked up under type 0)
        /// and <paramref name="doc2"/> (looked up under type 1). For every term whose global
        /// frequency is greater than 1, adds one to the result for each of the two types
        /// (0 and 1) whose per-type table contains the term.
        /// </summary>
        /// <param name="doc1">Document key looked up under type 0.</param>
        /// <param name="doc2">Document key looked up under type 1.</param>
        /// <returns>The accumulated count (0, 1 or 2 per qualifying term).</returns>
        protected int GetDocTermsTrueCount(string doc1, string doc2)
        {
            var firstDocTerms  = TermsByDoc[0, doc1];
            var secondDocTerms = TermsByDoc[1, doc2];
            var total          = 0;

            foreach (var candidate in firstDocTerms.Union(secondDocTerms))
            {
                // Terms whose global frequency is not above 1 contribute nothing.
                if (!(TermFWByGlobal[candidate].Freq > 1))
                {
                    continue;
                }

                if (TermFWByType.ContainsKey(0, candidate))
                {
                    total++;
                }
                if (TermFWByType.ContainsKey(1, candidate))
                {
                    total++;
                }
            }

            return total;
        }
Пример #3
0
        /// <summary>
        /// Similarity between two documents, computed as a cosine-style ratio:
        /// sum(v1 * v2) / sqrt(sum(v1 * v1) * sum(v2 * v2)), where v1 and v2 are the
        /// frequency-times-weight values of a term in each document.
        /// </summary>
        /// <param name="doc1">Document key looked up under type 0.</param>
        /// <param name="doc2">Document key looked up under type 1.</param>
        /// <returns>
        /// The similarity ratio, or -1 when the product of the two squared-norm sums
        /// is not greater than zero.
        /// </returns>
        private double computerSimilar(string doc1, string doc2)
        {
            var allTerms = TermsByDoc[0, doc1].Union(TermsByDoc[1, doc2]).ToArray();

            double dot     = 0;
            double normSq1 = 0;
            double normSq2 = 0;

            foreach (var term in allTerms)
            {
                // A term only contributes when its global frequency exceeds 1 and it is
                // present in the per-type tables of both type 0 and type 1. Every other
                // term would add zero to all three sums, so it is skipped.
                bool contributes = TermFWByGlobal[term].Freq > 1
                                   && TermFWByType.ContainsKey(0, term)
                                   && TermFWByType.ContainsKey(1, term);
                if (!contributes)
                {
                    continue;
                }

                double weighted1 = GetTermFreq(doc1, doc2, 0, doc1, term) * GetTermWeight(doc1, doc2, 0, doc1, term);
                double weighted2 = GetTermFreq(doc1, doc2, 1, doc2, term) * GetTermWeight(doc1, doc2, 1, doc2, term);

                dot     += weighted1 * weighted2;
                normSq1 += weighted1 * weighted1;
                normSq2 += weighted2 * weighted2;
            }

            // Keep the "> 0" form of the test: a zero, negative or NaN product all fall
            // through to the -1 sentinel.
            double normProduct = normSq1 * normSq2;
            return normProduct > 0 ? dot / Math.Sqrt(normProduct) : -1;
        }