/// <summary>
/// Cosine similarity between an item name and a pre-computed TF-IDF corpus.
/// </summary>
/// <param name="itemName">Item (product) name; it is word-segmented and stop-word filtered before scoring.</param>
/// <param name="Corpus">JSON-serialized <c>TFIDFJson</c> corpus (term → TF-IDF weight list).</param>
/// <returns>Cosine similarity; 0 when either vector is empty (the old code returned NaN).</returns>
public static double CosSimilarity(string itemName, string Corpus)
{
    // Segment the item name and drop stop words; tokens are space-separated.
    string splitword = SplitWordHelper.RemoveSplitWord(SplitWordHelper.SplitWords(itemName));
    string[] swarray = splitword.Split(' ').Where(s => !string.IsNullOrEmpty(s)).ToArray();

    TFIDFJson tfidf = JsonConvert.DeserializeObject<TFIDFJson>(Corpus);
    List<KeyValuePair<string, double>> corpus = tfidf.Data;

    // Term frequency of each distinct token.
    // BUGFIX: the old code ran Regex.Matches with the raw token as the pattern over the
    // space-stripped concatenation, which threw on tokens containing regex metacharacters
    // and overcounted substrings embedded in other tokens. Exact token counting is the
    // correct TF and is injection-safe.
    Dictionary<string, double> item = new Dictionary<string, double>();
    foreach (string sw in swarray)
    {
        if (!item.ContainsKey(sw))
        {
            int occurrences = swarray.Count(w => w == sw);
            item.Add(sw, (double)occurrences / swarray.Length);
        }
    }

    // Squared norms and dot product in one pass per vector (the old nested
    // ElementAt loops were O(n^2) per vector).
    double Ew2 = 0, Eb2 = 0, Ewb = 0;
    foreach (KeyValuePair<string, double> kv in item)
    {
        Ew2 += kv.Value * kv.Value;
    }
    foreach (KeyValuePair<string, double> kv in corpus)
    {
        Eb2 += kv.Value * kv.Value;
        double weight;
        if (item.TryGetValue(kv.Key, out weight))
        {
            Ewb += weight * kv.Value;
        }
    }

    double denominator = Math.Sqrt(Ew2) * Math.Sqrt(Eb2);
    // Guard against 0/0 → NaN when either vector is empty or all-zero.
    if (denominator == 0)
    {
        return 0;
    }
    return Ewb / denominator;
}
/// <summary>
/// Euclidean-distance "similarity" between an item name and a pre-computed TF-IDF corpus.
/// Matched dimensions contribute (a-b)^2; unmatched dimensions contribute their own squares.
/// Smaller values mean more similar.
/// </summary>
/// <param name="itemName">Item (product) name; it is word-segmented and stop-word filtered before scoring.</param>
/// <param name="Corpus">JSON-serialized <c>TFIDFJson</c> corpus (term → TF-IDF weight list).</param>
/// <returns>Euclidean distance between the two term-weight vectors.</returns>
public static double Euclidean(string itemName, string Corpus)
{
    // Segment the item name and drop stop words; tokens are space-separated.
    string splitword = SplitWordHelper.RemoveSplitWord(SplitWordHelper.SplitWords(itemName));
    string[] swarray = splitword.Split(' ').Where(s => !string.IsNullOrEmpty(s)).ToArray();

    TFIDFJson tfidf = JsonConvert.DeserializeObject<TFIDFJson>(Corpus);
    List<KeyValuePair<string, double>> corpus = tfidf.Data;

    // Term frequency of each distinct token, scaled by 100000 and truncated to an
    // integer exactly as the original did (kept for output compatibility).
    // BUGFIX: the old code ran Regex.Matches with the raw token as the pattern, which
    // threw on regex metacharacters and overcounted substrings; count exact tokens instead.
    Dictionary<string, double> item = new Dictionary<string, double>();
    foreach (string sw in swarray)
    {
        if (!item.ContainsKey(sw))
        {
            int occurrences = swarray.Count(w => w == sw);
            double tf = (double)occurrences / swarray.Length * 100000;
            item.Add(sw, (int)tf);
        }
    }

    // sum = Σ item^2 + Σ corpus^2, with each matched pair replaced by (a-b)^2.
    // Single pass per vector (the old nested ElementAt loops were O(n^2) per vector).
    double sum = 0;
    foreach (KeyValuePair<string, double> kv in item)
    {
        sum += kv.Value * kv.Value;
    }
    foreach (KeyValuePair<string, double> kv in corpus)
    {
        double weight;
        if (item.TryGetValue(kv.Key, out weight))
        {
            // Matched: net contribution is (a-b)^2 (a^2 was already added above).
            sum += (weight - kv.Value) * (weight - kv.Value) - weight * weight;
        }
        else
        {
            sum += kv.Value * kv.Value;
        }
    }

    return Math.Sqrt(sum);
}
/// <summary>
/// Improved TF-IDF computation: reads the per-category word-segmentation result files,
/// computes a TF-IDF weight for every distinct term of each category, and writes the
/// value-sorted term/weight list to one JSON file per category.
/// (The legacy "traditional TF-IDF" implementation that used to live here as
/// commented-out code has been removed.)
/// </summary>
public static void TFIDF()
{
    List<List<string>> allFilesWords = new List<List<string>>();
    int allFilesTrainDataNum = 0; // total number of training lines across all categories
    string srpath = @"..\..\SplitWordResultFiles\";
    string swpath = @"..\..\TF-IDF\";
    try
    {
        // Pass 1: collect every token of every category (global counts feed the IDF term).
        foreach (string fileName in TypeModel.GetTypeNameStringArray())
        {
            List<string> words = new List<string>();
            // BUGFIX: the old "while (!sr.EndOfStream)" pattern always skipped the last
            // line of each file; reading until ReadLine() returns null does not.
            using (StreamReader sr = new StreamReader(srpath + fileName + ".txt", Encoding.Default))
            {
                string readLine;
                while ((readLine = sr.ReadLine()) != null)
                {
                    allFilesTrainDataNum++;
                    words.AddRange(readLine.Split(' ').Where(s => !string.IsNullOrEmpty(s)));
                }
            }
            allFilesWords.Add(words);
        }
        Console.WriteLine("读取所有文档中非重复的分词结果完成!");

        // Pass 2: per category, compute TF-IDF for each distinct term.
        foreach (string fileName in TypeModel.GetTypeNameStringArray())
        {
            Console.WriteLine("正在计算" + fileName + "的tf-idf值:");
            List<string> dimension = new List<string>();  // distinct terms, first-seen order
            HashSet<string> seen = new HashSet<string>(); // O(1) dedup (List.Contains was O(n^2))
            List<string> oneFileWords = new List<string>();
            Dictionary<string, double> noLimitDimension = new Dictionary<string, double>();

            using (StreamReader sr = new StreamReader(srpath + fileName + ".txt", Encoding.Default))
            {
                string readLine;
                while ((readLine = sr.ReadLine()) != null)
                {
                    string[] tokens = readLine.Split(' ').Where(s => !string.IsNullOrEmpty(s)).ToArray();
                    oneFileWords.AddRange(tokens);
                    foreach (string word in tokens)
                    {
                        if (seen.Add(word))
                        {
                            dimension.Add(word);
                        }
                    }
                }
            }
            int oneFileWordsNum = oneFileWords.Count;

            foreach (string term in dimension)
            {
                if (string.IsNullOrEmpty(term)) { continue; }
                // Occurrences inside this category.
                int match1 = oneFileWords.Count(w => w == term);
                // Occurrences across all categories (match2 >= match1 > 0, so no /0 below).
                int match2 = 0;
                foreach (List<string> fileWords in allFilesWords)
                {
                    match2 += fileWords.Count(w => w == term);
                }
                double tf = (double)match1 / oneFileWordsNum;
                // Improved IDF: log of this category's share of the term's occurrences,
                // scaled by the total number of training lines.
                double idf = Math.Log((double)match1 / match2 * allFilesTrainDataNum);
                double tfidf = tf * idf;
                // Terms containing '@' are markers, not features — skip them.
                if (tfidf > 0 && !term.Contains('@'))
                {
                    noLimitDimension.Add(term, tfidf);
                }
            }

            List<KeyValuePair<string, double>> TF_IDF = SortedByValue(noLimitDimension);
            Console.WriteLine("正在写入" + fileName + "的TF-IDF值……");
            TFIDFJson result = new TFIDFJson();
            result.Type = fileName;
            result.Data = TF_IDF;
            string toWrite = JsonConvert.SerializeObject(result);
            // BUGFIX: overwrite (append: false) — the old append mode wrote a second JSON
            // document into an existing file on re-runs, producing unparseable output.
            using (StreamWriter sw = new StreamWriter(swpath + fileName + ".txt", false, Encoding.Default))
            {
                sw.Write(toWrite);
            }
            Console.WriteLine("写入" + fileName + "成功! \r\n");
        }
        Console.WriteLine("完成所有文本的TF-IDF值计算!");
    }
    catch (Exception)
    {
        // BUGFIX: "throw ex;" reset the stack trace; a bare rethrow preserves it.
        throw;
    }
}