예제 #1
0
        /// <summary>
        /// Trains a new <see cref="NGramClassifier"/> from three line-oriented text files and
        /// stores the result in "NGC.csv".
        /// </summary>
        /// <param name="sadFilePath">File whose lines are recorded with label -1.</param>
        /// <param name="happyFilePath">File whose lines are recorded with label +1.</param>
        /// <param name="objFilePath">File whose lines are recorded with label 0.</param>
        /// <param name="dataSize">
        /// Maximum number of lines to take from each file. Clamped to the length of the
        /// shortest file so that a short input no longer causes an out-of-range access.
        /// </param>
        static void NGCTrain(string sadFilePath, string happyFilePath, string objFilePath, int dataSize)
        {
            NGramClassifier NGC = new NGramClassifier();

            string[] sads = File.ReadAllLines(sadFilePath);
            string[] joys = File.ReadAllLines(happyFilePath);
            string[] objs = File.ReadAllLines(objFilePath);

            // The three files are consumed in lock-step, so the usable sample count is
            // bounded by the shortest of them.
            if (sads.Length < dataSize)
            {
                dataSize = sads.Length;
            }
            if (joys.Length < dataSize)
            {
                dataSize = joys.Length;
            }
            if (objs.Length < dataSize)
            {
                dataSize = objs.Length;
            }

            // Samples are recorded interleaved (sad, happy, objective) exactly as before,
            // in case the classifier is sensitive to recording order.
            for (int i = 0; i < dataSize; i++)
            {
                NGC.recordLine(sads[i], -1);
                NGC.recordLine(joys[i], +1);
                NGC.recordLine(objs[i], 0);
            }

            NGC.Store("NGC.csv");
        }
예제 #2
0
        /// <summary>
        /// Scores the lines of a tagged test file with both the n-gram classifier (loaded from
        /// "NGC.csv") and the POS classifier (loaded from "sentiobj.csv"), writes one CSV row per
        /// processed line to <paramref name="reportFilePath"/>, and prints error counts to the console.
        /// </summary>
        /// <param name="testFilePath">
        /// Test file. Each processed line is split on space/tab into at most four fields; the first
        /// field is the expected tag and the fourth is the tagged tweet.
        /// </param>
        /// <param name="reportFilePath">CSV report path; an existing file is overwritten.</param>
        /// <param name="dataSize">
        /// Exclusive upper bound on the line index processed; clamped to the number of lines in the file.
        /// </param>
        static void BothTest(string testFilePath, string reportFilePath, int dataSize)
        {
            NGramClassifier NGC = new NGramClassifier();

            NGC.Load("NGC.csv");

            Classifier POSC = new Classifier();

            POSC.load("sentiobj.csv");

            initPOSIndex();

            int err  = 0; // lines whose predicted marker differs from the tag
            int berr = 0; // mismatches among lines where neither tag nor marker is "neutral"
            int btot = 0; // lines where neither tag nor marker is "neutral"

            // 'using' guarantees the report is flushed and closed even if a line fails to parse.
            using (StreamWriter fs = new StreamWriter(reportFilePath, false))
            {
                char[] space      = { ' ', '\t' };
                char[] underScore = { '_' };

                string[] tData = File.ReadAllLines(testFilePath);

                if (tData.Length < dataSize)
                {
                    dataSize = tData.Length;
                }

                // Header now matches the eight columns actually written per row
                // (the previous header listed "score" twice).
                fs.WriteLine("i" + "," + "tag" + "," + "score" + "," + "marker" + "," + "ngcScore" + "," + "posScore" + "," + "sens" + "," + "tweet");

                // NOTE(review): index 0 is skipped, presumably because the first line is a header - confirm.
                for (int i = 1; i < dataSize; i++)
                {
                    string line = tData[i];

                    string[] lineParts = line.Split(space, 4);

                    string taggedTweet = lineParts[3];

                    // Rebuild the plain tweet by keeping only the text before the first '_' of each token.
                    string[] tweetParts = taggedTweet.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    string   tweet      = "";
                    foreach (string twpart in tweetParts)
                    {
                        string word = twpart.Split(underScore)[0];
                        tweet += word + " ";
                    }

                    // Only the part of the first field before its first '_' is the tag.
                    string tag = lineParts[0].Split(underScore, 2)[0];

                    double[] P = convert(posDist(taggedTweet));

                    double ngcScore = NGC.score(tweet);
                    double posScore = POSC.BinClassify(P);
                    string sens     = POSC.Classify(P);

                    // Halve and offset the POS score; values outside [-1, +1] are replaced by fixed constants.
                    double NPosScore = (posScore / 2) + .23;
                    if (NPosScore > +1)
                    {
                        NPosScore = +.71;
                    }
                    if (NPosScore < -1)
                    {
                        NPosScore = -.6;
                    }

                    double score = NPosScore + ngcScore;

                    string marker = "neutral";
                    if (score < -0.5)
                    {
                        marker = "negative";
                    }
                    else if (score > +0.5)
                    {
                        marker = "positive";
                    }

                    if (tag != marker)
                    {
                        err++;
                    }

                    if (tag != "neutral" && marker != "neutral")
                    {
                        btot++;
                        if (tag != marker)
                        {
                            berr++;
                        }
                    }

                    fs.WriteLine(i + "," + tag + "," + score + "," + marker + "," + ngcScore + "," + posScore + "," + sens + "," + tweet);
                }
            }

            Console.WriteLine("err: " + err);
            Console.WriteLine("real biased: " + btot);
            Console.WriteLine("err on biased: " + berr);
            Console.WriteLine("err on biased ratio: " + ((double)berr / btot));
        }
예제 #3
0
        /// <summary>
        /// Scores the lines of a test file with the n-gram classifier loaded from "NGC.csv",
        /// writes one CSV row per processed line to <paramref name="reportFilePath"/>, echoes
        /// each tag and score to the console, and finally prints the number of mismatches.
        /// </summary>
        /// <param name="testFilePath">
        /// Test file. Each processed line is split on space/tab into at most four fields; the first
        /// field is the expected tag and the fourth is the tweet text passed to the classifier.
        /// </param>
        /// <param name="reportFilePath">CSV report path; an existing file is overwritten.</param>
        /// <param name="dataSize">
        /// Exclusive upper bound on the line index processed; clamped to the number of lines in the file.
        /// </param>
        static void NGCTest(string testFilePath, string reportFilePath, int dataSize)
        {
            NGramClassifier NGC = new NGramClassifier();

            NGC.Load("NGC.csv");

            int err = 0; // lines whose predicted marker differs from the tag

            // 'using' guarantees the report is flushed and closed even if a line fails to parse.
            using (StreamWriter fs = new StreamWriter(reportFilePath, false))
            {
                char[] space = { ' ', '\t' };

                string[] tData = File.ReadAllLines(testFilePath);

                if (tData.Length < dataSize)
                {
                    dataSize = tData.Length;
                }

                // NOTE(review): index 0 is skipped, presumably because the first line is a header - confirm.
                for (int i = 1; i < dataSize; i++)
                {
                    string[] lineParts = tData[i].Split(space, 4);

                    string tweet = lineParts[3];
                    string tag   = lineParts[0];

                    double score = NGC.score(tweet);

                    Console.WriteLine(tag + " " + score);

                    // The thresholds are asymmetric: below -0.1 is negative, above +0.01 is positive.
                    string marker = "neutral";
                    if (score < -0.1)
                    {
                        marker = "negative";
                    }
                    else if (score > +.01)
                    {
                        marker = "positive";
                    }

                    if (tag != marker)
                    {
                        err++;
                    }

                    fs.WriteLine(i + "," + tag + "," + marker + "," + score + "," + tweet);
                }
            }

            Console.WriteLine("err: " + err);
        }
예제 #4
0
        //Entity,KeyWords, Opiniongram Scores
        /// <summary>
        /// For each line of a tagged tweet file, extracts entities and keywords, computes a combined
        /// polarity score from the n-gram classifier ("NGC.csv") and the POS classifier ("sentiobj.csv"),
        /// and writes the results to <paramref name="reportFilePath"/> + ".csv" and + ".xml".
        /// The XML file is afterwards reloaded, normalized and saved through <see cref="XmlDocument"/>.
        /// </summary>
        /// <param name="tweetFilePath">
        /// Input file, one tagged tweet per line. Every whitespace-separated token must contain a '_':
        /// the text before it is treated as the word and the text after it as the POS tag.
        /// </param>
        /// <param name="reportFilePath">Base path of the reports; ".csv" and ".xml" are appended.</param>
        /// <param name="dataSize">
        /// Number of lines to process; -1, or any value larger than the file, means all lines.
        /// </param>
        public void TEkwP(string tweetFilePath, string reportFilePath, int dataSize)
        {
            DateTime        startTime = DateTime.Now;
            NGramClassifier NGC       = new NGramClassifier();

            NGC.Load("NGC.csv");

            Classifier POSC = new Classifier();

            POSC.load("sentiobj.csv");

            Program.initPOSIndex();

            // Both writers are wrapped in 'using' so they are flushed and closed even when a
            // malformed line throws part-way through the run.
            using (StreamWriter fs  = new StreamWriter(reportFilePath + ".csv", false))
            using (StreamWriter xfs = new StreamWriter(reportFilePath + ".xml", false))
            {
                char[] underScore = { '_' };

                // Hoisted out of the loop: matches every character outside the 7-bit ASCII range.
                System.Text.RegularExpressions.Regex nonAscii = new System.Text.RegularExpressions.Regex(@"[^\u0000-\u007F]");

                string[] tData = File.ReadAllLines(tweetFilePath);

                if (tData.Length < dataSize || dataSize == -1)
                {
                    dataSize = tData.Length;
                }

                fs.WriteLine("i" + "," + "Tweet" + "," + "E" + "," + "kw" + "," + "Polarity Score" + "," + "marker" + "," + "ngcScore" + "," + "posScore" + "," + "sens");
                xfs.WriteLine("<TEkwPs>");

                Console.WriteLine("TEkwP");
                Console.WriteLine("take up: lyzing tweets");

                for (int i = 0; i < dataSize; i++)
                {
                    string line = tData[i];
                    string E    = "";
                    string kw   = "";
                    if (i % 100 == 0)
                    {
                        // Progress indicator, rewritten in place on the same console line.
                        Console.Write("\r" + i);
                    }

                    // Non-ASCII characters are stripped only for word/entity extraction;
                    // the POS distribution below is computed from the unmodified line.
                    string asciiLine = nonAscii.Replace(line, string.Empty);

                    string[] tweetParts      = asciiLine.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    string   tweet           = "";
                    int      ti              = 0;
                    int      prevEntityIndex = -999;
                    string   prevEntity      = "";
                    foreach (string twpart in tweetParts)
                    {
                        // Split each token once instead of once per field.
                        string[] wordAndPos = twpart.Split(underScore);
                        string   word       = wordAndPos[0];
                        string   pos        = wordAndPos[1];
                        tweet += word + " ";
                        word   = CleanWord(word);
                        if (isKeyWord(pos, word))
                        {
                            kw += keyWordiFy(word) + ";";
                        }
                        if (isEntity(pos, word))
                        {
                            // An entity directly following another entity is prefixed with the
                            // "<:< " marker so the two can be merged after the loop.
                            string bs = "";
                            if ((prevEntityIndex + 1 == ti) && !ContainFilter(prevEntity))
                            {
                                bs = "<:< ";
                            }
                            E += bs + EntitiFy(word) + ";";
                            prevEntityIndex = ti;
                            prevEntity      = word;
                        }
                        ti++;
                    }
                    // Collapse ";<:< " so adjacent entities become one space-separated entity.
                    E = E.Replace(";<:< ", " ");

                    double[] P = Program.convert(Program.posDist(line));

                    double ngcScore = NGC.score(tweet);
                    double posScore = POSC.BinClassify(P);
                    string sens     = POSC.Classify(P);

                    // Halve and offset the POS score; values outside [-1, +1] are replaced by +/-0.85.
                    double NPosScore = (posScore / 2) + 0.20;
                    if (NPosScore > +1)
                    {
                        NPosScore = +.85;
                    }
                    if (NPosScore < -1)
                    {
                        NPosScore = -.85;
                    }

                    double pScore = NPosScore + ngcScore;

                    // The marker is decided on the un-halved sum; the reported score is halved afterwards.
                    string marker = "neutral";
                    if (pScore <= -0.45)
                    {
                        marker = "negative";
                    }
                    else if (pScore >= +0.45)
                    {
                        marker = "positive";
                    }
                    pScore /= 2;

                    fs.WriteLine(i + ",\"" + tweet.Replace(",", "_CM_") + "\"," + E + "," + kw + "," + pScore + "," + marker + "," + ngcScore + "," + posScore + "," + sens);
                    xfs.WriteLine("<TEkwP i='" + i + "' pScore='" + pScore + "' marker='" + marker + "'>");
                    xfs.WriteLine("  <T>" + WebUtility.HtmlEncode(tweet.Replace("_CM_", ",")) + "</T>");
                    xfs.WriteLine("  <E>" + WebUtility.HtmlEncode(E.Replace(";", ",")) + "</E>");
                    xfs.WriteLine("  <kw>" + WebUtility.HtmlEncode(kw.Replace(";", ",")) + "</kw>");
                    xfs.WriteLine("</TEkwP>");
                }

                xfs.WriteLine("</TEkwPs>");
            }

            DateTime endTime = DateTime.Now;
            TimeSpan ts      = endTime - startTime;

            Console.WriteLine("\r" + dataSize + "\ndone! in " + ts + "\nStored TEkwP file at " + reportFilePath + ".csv \n\tand at " + reportFilePath + ".xml");

            // Round-trip the XML through XmlDocument, which also fails loudly if it is not well-formed.
            XmlDocument xDoc = new XmlDocument();

            xDoc.Load(reportFilePath + ".xml");
            xDoc.Normalize();
            xDoc.Save(reportFilePath + ".xml");
        }
예제 #5
0
        /// <summary>
        /// Scores every line of a tagged tweet file with the n-gram classifier ("NGC.csv") and the
        /// POS classifier ("sentiobj.csv"), writes a per-tweet CSV report, prints aggregate statistics
        /// to the console, and writes an XML summary for <paramref name="TERM"/>.
        /// </summary>
        /// <param name="TERM">Entity name written into the <c>entity</c> attribute of the XML summary.</param>
        /// <param name="tweetFilePath">Input file, one tagged tweet per line.</param>
        /// <param name="reportFilePath">Per-tweet CSV report path; an existing file is overwritten.</param>
        /// <param name="reportXMLpath">XML summary path; an existing file is overwritten.</param>
        /// <param name="dataSize">Maximum number of lines to process; clamped to the number of lines in the file.</param>
        static void OpinionGram(string TERM, string tweetFilePath, string reportFilePath, string reportXMLpath, int dataSize)
        {
            NGramClassifier NGC = new NGramClassifier();

            NGC.Load("NGC.csv");

            Classifier POSC = new Classifier();

            POSC.load("sentiobj.csv");

            initPOSIndex();

            double totalScore = 0;
            int    posCnt     = 0, negCnt = 0;

            // Declared outside the loop because the final value is the number of posts processed.
            int i;

            // 'using' guarantees the report is flushed and closed even if a line fails to process.
            using (StreamWriter fs = new StreamWriter(reportFilePath, false))
            {
                char[] underScore = { '_' };

                string[] tData = File.ReadAllLines(tweetFilePath);

                if (tData.Length < dataSize)
                {
                    dataSize = tData.Length;
                }

                fs.WriteLine("i" + "," + "score" + "," + "marker" + "," + "ngcScore" + "," + "posScore" + "," + "sens" + "," + "tweet");

                for (i = 0; i < dataSize; i++)
                {
                    string taggedTweet = tData[i];

                    // Rebuild the plain tweet by keeping only the text before the first '_' of each token.
                    string[] tweetParts = taggedTweet.Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    string   tweet      = "";
                    foreach (string twpart in tweetParts)
                    {
                        string word = twpart.Split(underScore)[0];
                        tweet += word + " ";
                    }

                    double[] P = convert(posDist(taggedTweet));

                    double ngcScore = NGC.score(tweet);
                    double posScore = POSC.BinClassify(P);
                    string sens     = POSC.Classify(P);

                    // Halve the POS score; values outside [-1, +1] are replaced by +/-0.85.
                    double NPosScore = (posScore / 2) + 0.0;
                    if (NPosScore > +1)
                    {
                        NPosScore = +.85;
                    }
                    if (NPosScore < -1)
                    {
                        NPosScore = -.85;
                    }

                    double score = NPosScore + ngcScore;

                    string marker = "neutral";
                    if (score < -0.25)
                    {
                        marker = "negative";
                        negCnt++;
                    }
                    else if (score > +0.25)
                    {
                        marker = "positive";
                        posCnt++;
                    }

                    totalScore += score;

                    fs.WriteLine(i + "," + score + "," + marker + "," + ngcScore + "," + posScore + "," + sens + "," + tweet);
                }
            }

            // With no posts the ratios would be 0/0 = NaN, which would end up in the XML summary;
            // report zeros instead.
            bool   hasPosts   = i > 0;
            double avgScore   = hasPosts ? totalScore / i : 0.0;
            double negRatio   = hasPosts ? ((double)negCnt) / i : 0.0;
            double posRatio   = hasPosts ? ((double)posCnt) / i : 0.0;
            double posPercent = hasPosts ? ((double)posCnt) * 100 / i : 0.0;
            double negPercent = hasPosts ? ((double)negCnt) * 100 / i : 0.0;

            Console.WriteLine("avg score = " + avgScore);
            Console.WriteLine("neg pct   = " + negRatio);
            Console.WriteLine("pos pct   = " + posRatio);
            Console.WriteLine("post count= " + i);

            using (StreamWriter XMLfs = new StreamWriter(reportXMLpath, false))
            {
                // TERM is caller-supplied text placed inside a single-quoted attribute, so XML
                // special characters (' " & < >) must be escaped to keep the document well-formed.
                XMLfs.WriteLine("<opinion entity='" + System.Security.SecurityElement.Escape(TERM) + "'>");
                XMLfs.WriteLine("  <score>" + avgScore.ToString("F2") + "</score>");
                XMLfs.WriteLine("  <analysis ");
                XMLfs.WriteLine("       post-count='" + i + "'");
                XMLfs.WriteLine("       percent-positive='" + posPercent.ToString("F2") + "'");
                XMLfs.WriteLine("       percent-negative='" + negPercent.ToString("F2") + "'" + " />");
                XMLfs.WriteLine("</opinion>");
            }
        }