/// <summary>
/// Cosine similarity between an item name and a pre-computed TF-IDF corpus.
/// </summary>
/// <param name="itemName">Item (product) name; it is word-segmented and stop-word filtered before scoring.</param>
/// <param name="Corpus">JSON-serialized <c>TFIDFJson</c> corpus (term → TF-IDF weight list).</param>
/// <returns>Cosine similarity; 0 when either vector is empty (the old code returned NaN).</returns>
public static double CosSimilarity(string itemName, string Corpus)
{
    // Segment the item name and drop stop words; tokens are space-separated.
    string splitword = SplitWordHelper.RemoveSplitWord(SplitWordHelper.SplitWords(itemName));
    string[] swarray = splitword.Split(' ').Where(s => !string.IsNullOrEmpty(s)).ToArray();

    TFIDFJson tfidf = JsonConvert.DeserializeObject<TFIDFJson>(Corpus);
    List<KeyValuePair<string, double>> corpus = tfidf.Data;

    // Term frequency of each distinct token.
    // BUGFIX: the old code ran Regex.Matches with the raw token as the pattern over the
    // space-stripped concatenation, which threw on tokens containing regex metacharacters
    // and overcounted substrings embedded in other tokens. Exact token counting is the
    // correct TF and is injection-safe.
    Dictionary<string, double> item = new Dictionary<string, double>();
    foreach (string sw in swarray)
    {
        if (!item.ContainsKey(sw))
        {
            int occurrences = swarray.Count(w => w == sw);
            item.Add(sw, (double)occurrences / swarray.Length);
        }
    }

    // Squared norms and dot product in one pass per vector (the old nested
    // ElementAt loops were O(n^2) per vector).
    double Ew2 = 0, Eb2 = 0, Ewb = 0;
    foreach (KeyValuePair<string, double> kv in item)
    {
        Ew2 += kv.Value * kv.Value;
    }
    foreach (KeyValuePair<string, double> kv in corpus)
    {
        Eb2 += kv.Value * kv.Value;
        double weight;
        if (item.TryGetValue(kv.Key, out weight))
        {
            Ewb += weight * kv.Value;
        }
    }

    double denominator = Math.Sqrt(Ew2) * Math.Sqrt(Eb2);
    // Guard against 0/0 → NaN when either vector is empty or all-zero.
    if (denominator == 0)
    {
        return 0;
    }
    return Ewb / denominator;
}
/// <summary>
/// Euclidean-distance "similarity" between an item name and a pre-computed TF-IDF corpus.
/// Matched dimensions contribute (a-b)^2; unmatched dimensions contribute their own squares.
/// Smaller values mean more similar.
/// </summary>
/// <param name="itemName">Item (product) name; it is word-segmented and stop-word filtered before scoring.</param>
/// <param name="Corpus">JSON-serialized <c>TFIDFJson</c> corpus (term → TF-IDF weight list).</param>
/// <returns>Euclidean distance between the two term-weight vectors.</returns>
public static double Euclidean(string itemName, string Corpus)
{
    // Segment the item name and drop stop words; tokens are space-separated.
    string splitword = SplitWordHelper.RemoveSplitWord(SplitWordHelper.SplitWords(itemName));
    string[] swarray = splitword.Split(' ').Where(s => !string.IsNullOrEmpty(s)).ToArray();

    TFIDFJson tfidf = JsonConvert.DeserializeObject<TFIDFJson>(Corpus);
    List<KeyValuePair<string, double>> corpus = tfidf.Data;

    // Term frequency of each distinct token, scaled by 100000 and truncated to an
    // integer exactly as the original did (kept for output compatibility).
    // BUGFIX: the old code ran Regex.Matches with the raw token as the pattern, which
    // threw on regex metacharacters and overcounted substrings; count exact tokens instead.
    Dictionary<string, double> item = new Dictionary<string, double>();
    foreach (string sw in swarray)
    {
        if (!item.ContainsKey(sw))
        {
            int occurrences = swarray.Count(w => w == sw);
            double tf = (double)occurrences / swarray.Length * 100000;
            item.Add(sw, (int)tf);
        }
    }

    // sum = Σ item^2 + Σ corpus^2, with each matched pair replaced by (a-b)^2.
    // Single pass per vector (the old nested ElementAt loops were O(n^2) per vector).
    double sum = 0;
    foreach (KeyValuePair<string, double> kv in item)
    {
        sum += kv.Value * kv.Value;
    }
    foreach (KeyValuePair<string, double> kv in corpus)
    {
        double weight;
        if (item.TryGetValue(kv.Key, out weight))
        {
            // Matched: net contribution is (a-b)^2 (a^2 was already added above).
            sum += (weight - kv.Value) * (weight - kv.Value) - weight * weight;
        }
        else
        {
            sum += kv.Value * kv.Value;
        }
    }

    return Math.Sqrt(sum);
}
/// <summary>
/// Improved TF-IDF computation: reads the per-category word-segmentation result files,
/// computes a TF-IDF weight for every distinct term of each category, and writes the
/// value-sorted term/weight list to one JSON file per category.
/// (The legacy "traditional TF-IDF" implementation that used to live here as
/// commented-out code has been removed.)
/// </summary>
public static void TFIDF()
{
    List<List<string>> allFilesWords = new List<List<string>>();
    int allFilesTrainDataNum = 0; // total number of training lines across all categories
    string srpath = @"..\..\SplitWordResultFiles\";
    string swpath = @"..\..\TF-IDF\";
    try
    {
        // Pass 1: collect every token of every category (global counts feed the IDF term).
        foreach (string fileName in TypeModel.GetTypeNameStringArray())
        {
            List<string> words = new List<string>();
            // BUGFIX: the old "while (!sr.EndOfStream)" pattern always skipped the last
            // line of each file; reading until ReadLine() returns null does not.
            using (StreamReader sr = new StreamReader(srpath + fileName + ".txt", Encoding.Default))
            {
                string readLine;
                while ((readLine = sr.ReadLine()) != null)
                {
                    allFilesTrainDataNum++;
                    words.AddRange(readLine.Split(' ').Where(s => !string.IsNullOrEmpty(s)));
                }
            }
            allFilesWords.Add(words);
        }
        Console.WriteLine("读取所有文档中非重复的分词结果完成!");

        // Pass 2: per category, compute TF-IDF for each distinct term.
        foreach (string fileName in TypeModel.GetTypeNameStringArray())
        {
            Console.WriteLine("正在计算" + fileName + "的tf-idf值:");
            List<string> dimension = new List<string>();  // distinct terms, first-seen order
            HashSet<string> seen = new HashSet<string>(); // O(1) dedup (List.Contains was O(n^2))
            List<string> oneFileWords = new List<string>();
            Dictionary<string, double> noLimitDimension = new Dictionary<string, double>();

            using (StreamReader sr = new StreamReader(srpath + fileName + ".txt", Encoding.Default))
            {
                string readLine;
                while ((readLine = sr.ReadLine()) != null)
                {
                    string[] tokens = readLine.Split(' ').Where(s => !string.IsNullOrEmpty(s)).ToArray();
                    oneFileWords.AddRange(tokens);
                    foreach (string word in tokens)
                    {
                        if (seen.Add(word))
                        {
                            dimension.Add(word);
                        }
                    }
                }
            }
            int oneFileWordsNum = oneFileWords.Count;

            foreach (string term in dimension)
            {
                if (string.IsNullOrEmpty(term)) { continue; }
                // Occurrences inside this category.
                int match1 = oneFileWords.Count(w => w == term);
                // Occurrences across all categories (match2 >= match1 > 0, so no /0 below).
                int match2 = 0;
                foreach (List<string> fileWords in allFilesWords)
                {
                    match2 += fileWords.Count(w => w == term);
                }
                double tf = (double)match1 / oneFileWordsNum;
                // Improved IDF: log of this category's share of the term's occurrences,
                // scaled by the total number of training lines.
                double idf = Math.Log((double)match1 / match2 * allFilesTrainDataNum);
                double tfidf = tf * idf;
                // Terms containing '@' are markers, not features — skip them.
                if (tfidf > 0 && !term.Contains('@'))
                {
                    noLimitDimension.Add(term, tfidf);
                }
            }

            List<KeyValuePair<string, double>> TF_IDF = SortedByValue(noLimitDimension);
            Console.WriteLine("正在写入" + fileName + "的TF-IDF值……");
            TFIDFJson result = new TFIDFJson();
            result.Type = fileName;
            result.Data = TF_IDF;
            string toWrite = JsonConvert.SerializeObject(result);
            // BUGFIX: overwrite (append: false) — the old append mode wrote a second JSON
            // document into an existing file on re-runs, producing unparseable output.
            using (StreamWriter sw = new StreamWriter(swpath + fileName + ".txt", false, Encoding.Default))
            {
                sw.Write(toWrite);
            }
            Console.WriteLine("写入" + fileName + "成功! \r\n");
        }
        Console.WriteLine("完成所有文本的TF-IDF值计算!");
    }
    catch (Exception)
    {
        // BUGFIX: "throw ex;" reset the stack trace; a bare rethrow preserves it.
        throw;
    }
}