Ejemplo n.º 1
0
        /// <summary>
        /// Reads every line of a fixed comments file, tokenizes and POS-tags each line, and
        /// collects every pair of adjacent tokens whose first tag is "N" and whose second tag
        /// is "N" or "ADJ" (presumably noun + noun/adjective pairs; the tag set comes from the
        /// tagger model and is not defined here).
        /// </summary>
        /// <returns>
        /// The matching two-token phrases, each formatted as "first second", in the order
        /// they occur in the file.
        /// </returns>
        static List <string> FindPhrase()
        {
            POSTagger tagger = new POSTagger(@"C:\Users\maryam\Documents\Visual Studio 2015\Projects\sentiment analysis\NHazm-master\NHazm\Resources\persian.tagger");
            string    source = @"C:\Users\maryam\Documents\Visual Studio 2015\Projects\sentiment analysis\data\after telegram filtering\24may2018.txt";

            List <string> phrases = new List <string>();

            foreach (string line in File.ReadAllLines(source))
            {
                List <string>     tokens = tokenize(line);
                List <TaggedWord> tagged = tagger.BatchTag(tokens);

                // Walk adjacent pairs (current, next); the last token has no successor.
                for (int next = 1; next < tagged.Count; next++)
                {
                    int current = next - 1;

                    if (tagged[current].tag() != "N")
                    {
                        continue;
                    }

                    var  followingTag = tagged[next].tag();
                    bool isPhraseTail = followingTag == "N" || followingTag == "ADJ";
                    if (!isPhraseTail)
                    {
                        continue;
                    }

                    // The phrase text comes from the token list, not from the tagged words.
                    phrases.Add(tokens[current] + " " + tokens[next]);
                }
            }

            return phrases;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Verifies that <c>POSTagger.BatchTag</c> returns one tagged word per input token
        /// and that every returned tag equals the expected tag for a fixed five-token sentence.
        /// </summary>
        public void BatchTagTest()
        {
            POSTagger tagger = new POSTagger();

            string[]          input    = new string[] { "من", "به", "مدرسه", "رفته بودم", "." };
            List <TaggedWord> expected = new List <TaggedWord>
            {
                new TaggedWord("من", "PR"),
                new TaggedWord("به", "PREP"),
                new TaggedWord("مدرسه", "N"),
                new TaggedWord("رفته بودم", "V"),
                new TaggedWord(".", "PUNC"),
            };

            List <TaggedWord> actual = tagger.BatchTag(new List <string>(input));

            string failureMessage = "Failed to tagged words of '" + string.Join(" ", input) + "' sentence";

            Assert.AreEqual(expected.Count, actual.Count, failureMessage);
            for (int i = 0; i < expected.Count; i++)
            {
                // Assert on the tags themselves. Comparing whole TaggedWord objects would make
                // the outcome depend on how TaggedWord implements equality (not visible here),
                // and a tag mismatch could pass unnoticed if equality only looks at the word.
                Assert.AreEqual(expected[i].tag(), actual[i].tag(), failureMessage);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Tags a fixed five-token sentence with <c>POSTagger.BatchTag</c> and checks that the
        /// result has the expected length and the expected tag at every position.
        /// </summary>
        public void BatchTagTest()
        {
            POSTagger tagger = new POSTagger();

            string[] input = new string[] { "من", "به", "مدرسه", "رفته بودم", "."};
            List<TaggedWord> expected = new List<TaggedWord>();
            expected.Add(new TaggedWord("من","PR"));
            expected.Add(new TaggedWord("به","PREP"));
            expected.Add(new TaggedWord("مدرسه","N"));
            expected.Add(new TaggedWord("رفته بودم","V"));
            expected.Add(new TaggedWord(".","PUNC"));
            List<TaggedWord> actual = tagger.BatchTag(new List<string>(input));

            string message = "Failed to tagged words of '" + string.Join(" ", input) + "' sentence";

            Assert.AreEqual(expected.Count, actual.Count, message);
            for (int i = 0; i < expected.Count; i++)
            {
                string expectedTag = expected[i].tag();
                string actualTag = actual[i].tag();

                // The tag is what this test is about, so compare it directly instead of
                // comparing TaggedWord objects, whose equality semantics are not shown here
                // and may not take the tag into account.
                Assert.AreEqual(expectedTag, actualTag, message);
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Keywords whose presence in a line is emitted as the binary features f0..f21,
        /// in exactly this order.
        /// </summary>
        private static readonly string[] TrainDataKeywords =
        {
            "تیم",
            "فوتبال",
            "لیگ",
            "قهرمان",
            "ملی",
            "استقلال",
            "هفته",
            "آسیا",
            "فیلم",
            "رقابت",
            "پرسپولیس",
            "مجلس",
            "وزیر",
            "استیضاح",
            "کشور",
            "خبرنگار",
            "خارجه",
            "گفت و گو",
            "نماینده",
            "اسلامی",
            "رهبر",
            "جلسه",
        };

        /// <summary>
        /// Builds a training file that interleaves lines from two class files: even rows come
        /// from <paramref name="class1FilePath"/> with label "politics", odd rows come from
        /// <paramref name="class2FilePath"/> with label "sports". Only as many lines as the
        /// shorter file has are taken from each file.
        /// </summary>
        /// <param name="class1FilePath">Text file whose lines become the "politics" rows.</param>
        /// <param name="class2FilePath">Text file whose lines become the "sports" rows.</param>
        /// <param name="dataTrainFilePath">Output file; it receives one row per input line used.</param>
        /// <remarks>
        /// Each row has the form <c>"{row} {label} f0 {v0} f1 {v1} ... "</c> (with a trailing
        /// space). The features are the keyword flags, then the POS tag of every token, then
        /// every pair of adjacent space-separated tokens.
        /// NOTE(review): the label-to-file mapping is kept exactly as it was; confirm that
        /// callers pass the politics file first.
        /// </remarks>
        static void TrainData(string class1FilePath, string class2FilePath, string dataTrainFilePath)
        {
            string[] c1 = File.ReadAllLines(class1FilePath);
            string[] c2 = File.ReadAllLines(class2FilePath);

            // One line from each file per pair, so both classes are equally represented.
            int      pairCount = Math.Min(c1.Length, c2.Length);
            string[] contents  = new string[2 * pairCount];

            POSTagger tagger = new POSTagger(@"C:\Users\maryam\Documents\Visual Studio 2015\Projects\mallet\NHazm-master\NHazm\Resources\persian.tagger");

            for (int pair = 0; pair < pairCount; pair++)
            {
                int evenRow = 2 * pair;
                int oddRow  = evenRow + 1;

                // Both rows of a pair use the same line index in their own file. The previous
                // code tokenized c2 with c1's (already advanced) index, which read the wrong
                // line and ran past the end of c2 on the last row.
                contents[evenRow] = BuildTrainDataRow(evenRow, "politics", c1[pair], tagger);
                contents[oddRow]  = BuildTrainDataRow(oddRow, "sports", c2[pair], tagger);
            }

            File.WriteAllLines(dataTrainFilePath, contents);
        }

        /// <summary>
        /// Formats one training row for <paramref name="text"/>: row index, label, and the
        /// numbered feature list (keyword flags, POS tags, adjacent-token pairs).
        /// </summary>
        private static string BuildTrainDataRow(int rowIndex, string label, string text, POSTagger tagger)
        {
            // A growable list instead of a fixed 100-slot array: a line with 40 or more tokens
            // produces more than 100 features and used to overflow the array.
            List <string> features = new List <string>();

            foreach (string keyword in TrainDataKeywords)
            {
                features.Add(text.Contains(keyword) ? "1" : "0");
            }

            string[]          tokens = text.Split(' ');
            List <TaggedWord> tagged = tagger.BatchTag(new List <string>(tokens));

            foreach (TaggedWord word in tagged)
            {
                features.Add(word.tag());
            }

            for (int j = 0; j < tokens.Length - 1; j++)
            {
                features.Add(tokens[j] + " " + tokens[j + 1]);
            }

            // Fully qualified because the file's using directives are not visible from here.
            System.Text.StringBuilder row = new System.Text.StringBuilder();
            row.Append(rowIndex.ToString()).Append(" ").Append(label).Append(" ");

            for (int j = 0; j < features.Count; j++)
            {
                row.Append("f").Append(j.ToString()).Append(" ").Append(features[j]).Append(" ");
            }

            return row.ToString();
        }