// Trains the n-gram classifier on three labelled corpora (sad = -1,
// happy = +1, objective = 0) and persists the model to "NGC.csv".
//   sadFilePath / happyFilePath / objFilePath: one training line per row.
//   dataSize: number of lines to consume from EACH of the three files.
static void NGCTrain(string sadFilePath, string happyFilePath, string objFilePath, int dataSize)
{
    NGramClassifier NGC = new NGramClassifier();
    string[] sads = File.ReadAllLines(sadFilePath);
    string[] joys = File.ReadAllLines(happyFilePath);
    string[] objs = File.ReadAllLines(objFilePath);

    // BUG FIX: clamp to the shortest corpus so an oversized dataSize can no
    // longer throw IndexOutOfRangeException part-way through training.
    dataSize = Math.Min(dataSize, Math.Min(sads.Length, Math.Min(joys.Length, objs.Length)));

    for (int i = 0; i < dataSize; i++)
    {
        NGC.recordLine(sads[i], -1); // negative sentiment
        NGC.recordLine(joys[i], +1); // positive sentiment
        NGC.recordLine(objs[i], 0);  // objective / neutral (was "00")
    }
    NGC.Store("NGC.csv");
}
// Scores each tagged test tweet with BOTH classifiers (n-gram + POS
// distribution), combines them into a single polarity score, and writes a
// CSV report. Prints overall and "biased-only" error counts to the console.
//   testFilePath: lines of "<tag> <..> <..> <tagged tweet>" where tweet
//                 tokens are "word_POS"; line 0 is treated as a header and
//                 skipped (loop starts at i = 1).
//   reportFilePath: destination CSV (overwritten).
//   dataSize: max lines to evaluate (clamped to the file length).
static void BothTest(string testFilePath, string reportFilePath, int dataSize)
{
    NGramClassifier NGC = new NGramClassifier();
    NGC.Load("NGC.csv");
    Classifier POSC = new Classifier();
    POSC.load("sentiobj.csv");
    initPOSIndex();

    char[] space = { ' ', '\t' };
    char[] underScore = { '_' };
    string[] tData = File.ReadAllLines(testFilePath);
    if (tData.Length < dataSize) { dataSize = tData.Length; }

    int err = 0;   // total misclassifications
    int berr = 0;  // misclassifications where BOTH tag and prediction are non-neutral
    int btot = 0;  // cases where both tag and prediction are non-neutral

    // using ensures the report is flushed/closed even if a classifier call
    // throws (the original leaked the StreamWriter on any exception).
    using (StreamWriter fs = new StreamWriter(reportFilePath, false))
    {
        // BUG FIX: the original header contained a duplicate "score" column
        // (9 header fields vs 8 data fields written per row); the header now
        // matches the emitted columns exactly.
        fs.WriteLine("i" + "," + "tag" + "," + "score" + "," + "marker" + "," + "ngcScore" + "," + "posScore" + "," + "sens" + "," + "tweet");

        for (int i = 1; i < dataSize; i++)
        {
            string line = tData[i];
            string[] lineParts = line.Split(space, 4);
            string taggedTweet = lineParts[3];
            string[] tweetParts = taggedTweet.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

            // Strip the "_POS" suffix from every token to recover plain text.
            string tweet = "";
            foreach (string twpart in tweetParts)
            {
                string word = twpart.Split("_".ToCharArray())[0];
                tweet += word + " ";
            }

            // The gold label is everything before the first underscore.
            string tag = lineParts[0].Split(underScore, 2)[0];

            double[] P = convert(posDist(taggedTweet));
            double ngcScore = NGC.score(tweet);
            double posScore = POSC.BinClassify(P);
            string sens = POSC.Classify(P);

            // Normalize the POS score (empirical +0.23 bias, asymmetric
            // clamps) and combine with the n-gram score.
            double NPosScore = (posScore / 2) + .23;
            if (NPosScore > +1) { NPosScore = +.71; }
            if (NPosScore < -1) { NPosScore = -.6; }
            double score = NPosScore + ngcScore;

            string marker = "neutral";
            if (score < -0.5) { marker = "negative"; }
            else if (score > +0.5) { marker = "positive"; }

            if (tag != marker) { err++; }
            if (tag != "neutral" && marker != "neutral")
            {
                btot++;
                if (tag != marker) { berr++; }
            }

            fs.WriteLine(i + "," + tag + "," + score + "," + marker + "," + ngcScore + "," + posScore + "," + sens + "," + tweet);
        }
    }

    Console.WriteLine("err: " + err);
    Console.WriteLine("real biased: " + btot);
    Console.WriteLine("err on biased: " + berr);
    Console.WriteLine("err on biased ratio: " + ((double)berr / btot));
}
// Scores each test tweet with the n-gram classifier only; writes a CSV
// report, echoes each (tag, score) pair to the console, and prints the total
// error count at the end. Line 0 is treated as a header (loop starts at 1).
// NOTE(review): the neutral band is asymmetric (-0.1 vs +0.01) — this looks
// like a typo for +0.1, but it is preserved because changing it alters the
// reported results; confirm against the experiment this was tuned for.
static void NGCTest(string testFilePath, string reportFilePath, int dataSize)
{
    NGramClassifier NGC = new NGramClassifier();
    NGC.Load("NGC.csv");

    char[] space = { ' ', '\t' };
    string[] tData = File.ReadAllLines(testFilePath);
    if (tData.Length < dataSize) { dataSize = tData.Length; }

    int err = 0; // misclassification count

    // using ensures the report is closed even if scoring throws (the
    // original leaked the StreamWriter on any exception).
    using (StreamWriter fs = new StreamWriter(reportFilePath, false))
    {
        for (int i = 1; i < dataSize; i++)
        {
            string line = tData[i];
            string[] lineParts = line.Split(space, 4);
            string tweet = lineParts[3];
            string tag = lineParts[0];

            double score = NGC.score(tweet);
            Console.WriteLine(tag + " " + score);

            string marker = "neutral";
            if (score < -0.1) { marker = "negative"; }
            else if (score > +.01) { marker = "positive"; }

            if (tag != marker) { err++; }
            fs.WriteLine(i + "," + tag + "," + marker + "," + score + "," + tweet);
        }
    }
    Console.WriteLine("err: " + err);
}
//Entity, KeyWords, Opiniongram Scores
// Analyzes every (POS-tagged) tweet line: extracts entities and keywords,
// computes a combined polarity score, and writes both a CSV and an XML
// report. Progress is printed every 100 tweets.
//   tweetFilePath: one tagged tweet per line, tokens formatted "word_POS".
//   reportFilePath: base path; ".csv" and ".xml" are appended.
//   dataSize: max lines to process; -1 (or >= file length) means all.
public void TEkwP(string tweetFilePath, string reportFilePath, int dataSize)
{
    DateTime startTime = DateTime.Now;
    NGramClassifier NGC = new NGramClassifier();
    NGC.Load("NGC.csv");
    Classifier POSC = new Classifier();
    POSC.load("sentiobj.csv");
    Program.initPOSIndex();

    string[] tData = File.ReadAllLines(tweetFilePath);
    if (tData.Length < dataSize || dataSize == -1) { dataSize = tData.Length; }

    double totalScore = 0;           // accumulated but not currently reported
    int posCnt = 0, negCnt = 0;      // accumulated but not currently reported
    int i;

    // using ensures both reports are flushed/closed even on exception
    // (the original leaked both StreamWriters).
    using (StreamWriter fs = new StreamWriter(reportFilePath + ".csv", false))
    using (StreamWriter xfs = new StreamWriter(reportFilePath + ".xml", false))
    {
        fs.WriteLine("i" + "," + "Tweet" + "," + "E" + "," + "kw" + "," + "Polarity Score" + "," + "marker" + "," + "ngcScore" + "," + "posScore" + "," + "sens");
        xfs.WriteLine("<TEkwPs>");
        Console.WriteLine("TEkwP");
        Console.WriteLine("take up: lyzing tweets");

        for (i = 0; i < dataSize; i++)
        {
            string line = tData[i];
            string E = "";   // ';'-separated entity list
            string kw = "";  // ';'-separated keyword list
            if (i % 100 == 0) { Console.Write("\r" + i); } // progress indicator

            // Strip non-ASCII characters before tokenizing.
            string taggedTweet = System.Text.RegularExpressions.Regex.Replace(line, @"[^\u0000-\u007F]", string.Empty);
            string[] tweetParts = taggedTweet.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

            string tweet = "";
            int ti = 0;
            int prevEntityIndex = -999; // index of the previous entity token
            string prevEntity = "";
            foreach (string twpart in tweetParts)
            {
                string word = twpart.Split("_".ToCharArray())[0];
                string pos = twpart.Split("_".ToCharArray())[1];
                tweet += word + " ";
                word = CleanWord(word);
                if (isKeyWord(pos, word)) { kw += keyWordiFy(word) + ";"; }
                if (isEntity(pos, word))
                {
                    // Consecutive entity tokens are merged into one multi-word
                    // entity; "<:< " is a temporary join marker removed below.
                    string bs = "";
                    if ((prevEntityIndex + 1 == ti) && !ContainFilter(prevEntity)) { bs = "<:< "; }
                    E += bs + EntitiFy(word) + ";";
                    prevEntityIndex = ti;
                    prevEntity = word;
                }
                ti++;
            }
            E = E.Replace(";<:< ", " ");

            taggedTweet = line;
            double[] P = Program.convert(Program.posDist(taggedTweet));
            double ngcScore = NGC.score(tweet);
            double posScore = POSC.BinClassify(P);
            string sens = POSC.Classify(P);

            // Normalize the POS score (empirical +0.20 bias) and combine.
            double NPosScore = (posScore / 2) + 0.20;
            if (NPosScore > +1) { NPosScore = +.85; }
            if (NPosScore < -1) { NPosScore = -.85; }
            double pScore = NPosScore + ngcScore;

            // BUG FIX: original tested "pScore == double.NaN", which is ALWAYS
            // false (NaN compares unequal to everything, including itself).
            // Use double.IsNaN and clamp so the reports stay well-formed.
            if (double.IsNaN(pScore)) { pScore = 0.0; }

            string marker = "neutral";
            if (pScore <= -0.45) { marker = "negative"; negCnt++; }
            else if (pScore >= +0.45) { marker = "positive"; posCnt++; }

            pScore /= 2; // rescale into roughly [-1, +1] for reporting
            totalScore += pScore;

            fs.WriteLine(i + ",\"" + tweet.Replace(",", "_CM_") + "\"," + E + "," + kw + "," + pScore + "," + marker + "," + ngcScore + "," + posScore + "," + sens);
            xfs.WriteLine("<TEkwP i='" + i + "' pScore='" + pScore + "' marker='" + marker + "'>");
            xfs.WriteLine(" <T>" + WebUtility.HtmlEncode(tweet.Replace("_CM_", ",")) + "</T>");
            xfs.WriteLine(" <E>" + WebUtility.HtmlEncode(E.Replace(";", ",")) + "</E>");
            xfs.WriteLine(" <kw>" + WebUtility.HtmlEncode(kw.Replace(";", ",")) + "</kw>");
            xfs.WriteLine("</TEkwP>");
        }
        xfs.WriteLine("</TEkwPs>");
    }

    DateTime endTime = DateTime.Now;
    TimeSpan ts = endTime - startTime;
    Console.WriteLine("\r" + dataSize + "\ndone! in " + ts + "\nStored TEkwP file at " + reportFilePath + ".csv \n\tand at " + reportFilePath + ".xml");

    // Round-trip through XmlDocument to normalize/validate the XML output.
    XmlDocument xDoc = new XmlDocument();
    xDoc.Load(reportFilePath + ".xml");
    xDoc.Normalize();
    xDoc.Save(reportFilePath + ".xml");
}
// Scores every line of tweetFilePath with both classifiers, writes a
// per-tweet CSV report and an aggregate XML summary for TERM.
//   TERM: entity name embedded in the XML <opinion entity='...'> element.
//   reportFilePath / reportXMLpath: CSV and XML destinations (overwritten).
//   dataSize: max lines to process (clamped to the file length).
static void OpinionGram(string TERM, string tweetFilePath, string reportFilePath, string reportXMLpath, int dataSize)
{
    NGramClassifier NGC = new NGramClassifier();
    NGC.Load("NGC.csv");
    Classifier POSC = new Classifier();
    POSC.load("sentiobj.csv");
    initPOSIndex();

    string[] tData = File.ReadAllLines(tweetFilePath);
    if (tData.Length < dataSize) { dataSize = tData.Length; }

    double totalScore = 0;
    int posCnt = 0, negCnt = 0;
    int i; // declared outside the loop: used in the aggregates below

    // using ensures the CSV is flushed/closed even on exception
    // (the original leaked the StreamWriter).
    using (StreamWriter fs = new StreamWriter(reportFilePath, false))
    {
        fs.WriteLine("i" + "," + "score" + "," + "marker" + "," + "ngcScore" + "," + "posScore" + "," + "sens" + "," + "tweet");
        for (i = 0; i < dataSize; i++)
        {
            string line = tData[i];
            string taggedTweet = line;
            string[] tweetParts = taggedTweet.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

            // Recover plain text by dropping each token's "_POS" suffix.
            string tweet = "";
            foreach (string twpart in tweetParts)
            {
                string word = twpart.Split("_".ToCharArray())[0];
                tweet += word + " ";
            }

            double[] P = convert(posDist(taggedTweet));
            double ngcScore = NGC.score(tweet);
            double posScore = POSC.BinClassify(P);
            string sens = POSC.Classify(P);

            // Normalize the POS score (no bias here, unlike BothTest/TEkwP).
            double NPosScore = (posScore / 2) + 0.0;
            if (NPosScore > +1) { NPosScore = +.85; }
            if (NPosScore < -1) { NPosScore = -.85; }
            double score = NPosScore + ngcScore;

            // BUG FIX: original tested "score == double.NaN", which is ALWAYS
            // false (NaN compares unequal to everything). Use double.IsNaN
            // and clamp so the averages below stay finite.
            if (double.IsNaN(score)) { score = 0.0; }

            string marker = "neutral";
            if (score < -0.25) { marker = "negative"; negCnt++; }
            else if (score > +0.25) { marker = "positive"; posCnt++; }
            totalScore += score;

            fs.WriteLine(i + "," + score + "," + marker + "," + ngcScore + "," + posScore + "," + sens + "," + tweet);
        }
    }

    // BUG FIX: guard against divide-by-zero (NaN in console/XML output) when
    // the input file is empty; with a non-empty file n == i as before.
    int n = Math.Max(i, 1);
    Console.WriteLine("avg score = " + (totalScore / n));
    Console.WriteLine("neg pct = " + (((double)negCnt) / n));
    Console.WriteLine("pos pct = " + (((double)posCnt) / n));
    Console.WriteLine("post count= " + i);

    using (StreamWriter XMLfs = new StreamWriter(reportXMLpath, false))
    {
        XMLfs.WriteLine("<opinion entity='" + TERM + "'>");
        XMLfs.WriteLine(" <score>" + (totalScore / n).ToString("F2") + "</score>");
        XMLfs.WriteLine(" <analysis ");
        XMLfs.WriteLine(" post-count='" + i + "'");
        XMLfs.WriteLine(" percent-positive='" + (((double)posCnt) * 100 / n).ToString("F2") + "'");
        XMLfs.WriteLine(" percent-negative='" + (((double)negCnt) * 100 / n).ToString("F2") + "'" + " />");
        XMLfs.WriteLine("</opinion>");
    }
}