/// <summary>
/// Cleans the training samples and trains the sentiment model: tokenizes each
/// line of the positive and negative corpora, accumulates word counts into
/// <paramref name="d"/>, then runs two training passes over the collected data.
/// </summary>
/// <param name="negWords">Negative training corpus (one sentence per line).</param>
/// <param name="negFilters">Negative word dictionary text fed to the tokenizer.</param>
/// <param name="posWords">Positive training corpus (one sentence per line).</param>
/// <param name="posFilters">Positive word dictionary text fed to the tokenizer.</param>
/// <param name="d">Dictionary holding the "pos" and "neg" word-count models.</param>
/// <param name="stopwords">Stop-word text; these words are excluded from counting.</param>
public static void Train_data(string negWords, string negFilters, string posWords, string posFilters, ref Dictionary<string, AddOneProb> d, string stopwords)
{
    var data = new List<Tuple<List<string>, string>>();
    var sent_cut = new Jieba();
    sent_cut.NegWords = negFilters;
    sent_cut.PosWords = posFilters;
    foreach (var sent in posWords.Replace("\r", "").Split('\n'))
    {
        if (string.IsNullOrWhiteSpace(sent))
        {
            continue;
        }
        sent_cut.doc = FilterSymbol(sent);
        sent_cut.stopwords = stopwords;
        var words = sent_cut.handle_sentiment(false);
        // BUG FIX: the original enumerated `words` before its null check, so a
        // null return from handle_sentiment would throw NullReferenceException
        // before the guard ran. Check null/empty first.
        if (words == null || words.Count == 0)
        {
            continue;
        }
        foreach (var word in words)
        {
            d["pos"].Add(word, CoreCalculations.IDCount, 1);
        }
        data.Add(new Tuple<List<string>, string>(words, "pos"));
    }
    Console.WriteLine("正面词库导入完毕");
    foreach (var sent in negWords.Replace("\r", "").Split('\n'))
    {
        if (string.IsNullOrWhiteSpace(sent))
        {
            continue;
        }
        sent_cut.doc = FilterSymbol(sent);
        sent_cut.stopwords = stopwords;
        // NOTE(review): the positive loop passes `false` to handle_sentiment
        // while this one uses the default — presumably intentional; confirm.
        var words = sent_cut.handle_sentiment();
        if (words == null || words.Count == 0)
        {
            continue;
        }
        foreach (var word in words)
        {
            d["neg"].Add(word, CoreCalculations.IDCount, 1);
        }
        data.Add(new Tuple<List<string>, string>(words, "neg"));
    }
    Console.WriteLine("负面词库导入完毕");
    // Compute per-word frequencies for both the "pos" and "neg" models.
    foreach (var k in d)
    {
        k.Value.CalculatRate();
    }
    // Two training passes over every collected sample.
    for (int i = 0; i < 2; i++)
    {
        foreach (var d_ in data)
        {
            // Item2 is already a string ("pos"/"neg"); no ToString() needed.
            CoreCalculations.Sensor(d, d_.Item1, d_.Item2);
        }
    }
}
/// <summary>
/// Entry point: loads the trained sentiment model and corpora, retrains with
/// <see cref="Train.Train_data"/>, classifies the test set, and prints the
/// ambiguity/negative/positive rates before and after removing ambiguous samples.
/// </summary>
static void Main(string[] args)
{
    // |pos - neg| at or below this score gap is counted as ambiguous.
    const double AmbiguityThreshold = 0.2;

    var sentimentJson = FileHandle.ReadTxtToEnd(SentimentFilepath + "sentiment_json.txt");
    var d = Train.Load(sentimentJson);
    var posFilters = FileHandle.ReadTxtToEnd(SentimentFilepath + "pos");      // positive word dictionary
    var posWords = FileHandle.ReadTxtToEnd(SentimentFilepath + "pos_train");  // positive training corpus
    var negFilters = FileHandle.ReadTxtToEnd(SentimentFilepath + "neg");      // negative word dictionary
    var negWords = FileHandle.ReadTxtToEnd(SentimentFilepath + "neg_train");  // negative training corpus
    var stopWords = FileHandle.ReadTxtToEnd(SentimentFilepath + "stopwords"); // words excluded from scoring
    Train.Train_data(negWords, negFilters, posWords, posFilters, ref d, stopWords); // import training set (replaceable)

    var test = FileHandle.ReadTxtToEnd(SentimentFilepath + "testpos"); // test set
    double countResult = 0;
    double posCount = 0;
    double negCount = 0;
    var limitCount = new Dictionary<string, double> { ["neg"] = 0, ["pos"] = 0 };
    foreach (var t in test.Replace("\r", "").Split('\n'))
    {
        if (string.IsNullOrWhiteSpace(t))
        {
            continue;
        }
        countResult++;
        var sent = CoreCalculations.Classify_(t, d, stopWords);
        double limit = Math.Abs(sent["pos"] - sent["neg"]);
        if (sent["neg"] > sent["pos"])
        {
            if (limit <= AmbiguityThreshold)
            {
                limitCount["neg"]++;
            }
            negCount++;
        }
        else
        {
            // Ties fall through to the positive bucket, as in the original logic.
            if (limit <= AmbiguityThreshold)
            {
                limitCount["pos"]++;
            }
            posCount++;
        }
    }

    // ROBUSTNESS FIX: an empty test set would otherwise divide by zero and
    // print NaN for every rate below.
    if (countResult == 0)
    {
        Console.WriteLine("测试集为空");
        Console.ReadLine();
        return;
    }

    Console.WriteLine("模糊率:" + limitCount.Sum(l => l.Value) / countResult);
    Console.WriteLine("负面率:" + negCount / countResult);
    Console.WriteLine("正面率:" + posCount / countResult);

    // Remove ambiguous samples and recompute the rates.
    countResult -= limitCount.Sum(l => l.Value);
    negCount -= limitCount["neg"];
    posCount -= limitCount["pos"];
    // ROBUSTNESS FIX: every sample may have been ambiguous, leaving zero.
    if (countResult == 0)
    {
        Console.WriteLine("去模糊后无剩余样本");
    }
    else
    {
        Console.WriteLine("去模糊后负面率:" + negCount / countResult);
        Console.WriteLine("去模糊后正面率:" + posCount / countResult);
    }
    Console.ReadLine();
}