Example #1
0
        /// <summary>
        /// Builds the SVM test-feature files for the shifted and the unshifted sentence sets.
        /// </summary>
        /// <remarks>
        /// The shifted set reads <c>./shifted_sent.txt</c>, <c>./shifted_file.txt</c> and
        /// <c>./shiftedsent_test.txt</c> and writes <c>./svm-s_test.txt</c>; the unshifted set reads
        /// the corresponding <c>unshifted</c> files and writes <c>./svm-u_test.txt</c>. Each set
        /// starts from empty vocabularies and counters.
        /// </remarks>
        public void Processing()
        {
            NLP nlp = new NLP();

            WriteSentenceTestFeatures(nlp, "./shifted_sent.txt", "./shifted_file.txt",
                                      "./shiftedsent_test.txt", "./svm-s_test.txt");
            WriteSentenceTestFeatures(nlp, "./unshifted_sent.txt", "./unshifted_file.txt",
                                      "./unshiftedsent_test.txt", "./svm-u_test.txt");
        }

        /// <summary>
        /// Processes one sentence set: counts tokens of the labelled sentences, assigns feature
        /// ids, and writes one <c>+1 id:value ...</c> line per test sentence.
        /// </summary>
        /// <param name="nlp">NLP helper used to turn a sentence into tokens.</param>
        /// <param name="sentencePath">File with one labelled sentence per line.</param>
        /// <param name="sourcePath">File read in lock-step with <paramref name="sentencePath"/>;
        /// a line containing "29590" switches the label to "+1".</param>
        /// <param name="testPath">File with one test sentence per line.</param>
        /// <param name="outputPath">Feature file to create (overwritten if present).</param>
        private static void WriteSentenceTestFeatures(NLP nlp, string sentencePath, string sourcePath,
                                                      string testPath, string outputPath)
        {
            Dictionary<string, int> featureIds  = new Dictionary<string, int>();
            Dictionary<string, int> minusCounts = new Dictionary<string, int>(); // tokens seen under label "-1"
            Dictionary<string, int> plusCounts  = new Dictionary<string, int>(); // tokens seen under label "+1"
            Dictionary<string, int> lineCounts  = new Dictionary<string, int>();
            int    nextId = 1;
            int    minusTotal = 0, plusTotal = 0;
            bool   plusLabel = false;
            string line, source;

            // Every stream lives in a using block so it is closed even when an exception occurs.
            using (StreamWriter output = new StreamWriter(outputPath))
            {
                using (StreamReader sentences = new StreamReader(sentencePath))
                using (StreamReader sources = new StreamReader(sourcePath))
                {
                    // Stops as soon as either file runs out of lines.
                    while ((line = sentences.ReadLine()) != null && (source = sources.ReadLine()) != null)
                    {
                        // NOTE(review): the label is sticky - once a source line contains "29590",
                        // every later sentence is counted as "+1". Kept as in the original; confirm
                        // that the input ordering makes this the intended behaviour.
                        if (source.Contains("29590"))
                        {
                            plusLabel = true;
                        }

                        foreach (string token in NormalizeSentenceTokens(nlp, line))
                        {
                            if (plusLabel)
                            {
                                plusTotal++;
                                IncrementSentenceCount(plusCounts, token);
                            }
                            else
                            {
                                minusTotal++;
                                IncrementSentenceCount(minusCounts, token);
                            }
                            if (!featureIds.ContainsKey(token))
                            {
                                featureIds.Add(token, nextId++);
                            }
                        }
                    }
                }

                // First pass over the test file: give every test token a feature id.
                using (StreamReader vocabularyReader = new StreamReader(testPath))
                {
                    while ((line = vocabularyReader.ReadLine()) != null)
                    {
                        foreach (string token in NormalizeSentenceTokens(nlp, line))
                        {
                            if (!featureIds.ContainsKey(token))
                            {
                                featureIds.Add(token, nextId++);
                            }
                        }
                    }
                }

                // Second pass over the test file: emit one feature line per sentence.
                using (StreamReader testReader = new StreamReader(testPath))
                {
                    while ((line = testReader.ReadLine()) != null)
                    {
                        output.Write("+1 ");

                        string[] tokens = NormalizeSentenceTokens(nlp, line);
                        lineCounts.Clear();
                        foreach (string token in tokens)
                        {
                            IncrementSentenceCount(lineCounts, token);
                        }
                        int lineTotal = tokens.Length;

                        foreach (KeyValuePair<string, int> entry in lineCounts)
                        {
                            // Only tokens observed under both labels are emitted.
                            int minusCount;
                            if (!minusCounts.TryGetValue(entry.Key, out minusCount))
                            {
                                continue;
                            }
                            int plusCount;
                            if (!plusCounts.TryGetValue(entry.Key, out plusCount))
                            {
                                continue;
                            }

                            // log( relative frequency in this sentence / relative frequency in the labelled data )
                            double weight = Math.Log(((double)entry.Value / (double)lineTotal)
                                                     / ((double)(minusCount + plusCount) / (double)(minusTotal + plusTotal)));
                            if (weight < 0)
                            {
                                continue;
                            }

                            // Invariant culture keeps '.' as the decimal separator on every machine.
                            output.Write(featureIds[entry.Key].ToString(System.Globalization.CultureInfo.InvariantCulture)
                                         + ":" + weight.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                        }
                        output.WriteLine();
                    }
                }
            }
        }

        /// <summary>
        /// Runs the NLP calls Tokenize, FilterOutStopWords, Stem and Lemmatization (in that order) on a sentence.
        /// </summary>
        private static string[] NormalizeSentenceTokens(NLP nlp, string text)
        {
            return nlp.Lemmatization(nlp.Stem(nlp.FilterOutStopWords(nlp.Tokenize(text))));
        }

        /// <summary>
        /// Adds one to the count stored for <paramref name="key"/>, starting at zero for unseen keys.
        /// </summary>
        private static void IncrementSentenceCount(Dictionary<string, int> counts, string key)
        {
            int current;
            counts.TryGetValue(key, out current); // leaves 0 in current when the key is missing
            counts[key] = current + 1;
        }
        /// <summary>
        /// Builds bigram feature files from three folders of text documents.
        /// </summary>
        /// <remarks>
        /// Pass 1 reads every file of all three folders, counts the bigram features of
        /// <paramref name="folder1"/> and <paramref name="folder2"/> separately, and assigns a
        /// numeric id to every feature seen. Pass 2 writes one line per document:
        /// documents of <paramref name="folder1"/> (label "-1") and <paramref name="folder2"/>
        /// (label "+1") go to <c>./svm_B.txt</c>, documents of <paramref name="folder3"/>
        /// (label "+1") go to <c>./svm_B_test.txt</c>. The cumulative feature counts are also
        /// dumped to the console after each folder of pass 1.
        /// </remarks>
        /// <param name="folder1">Path suffix appended to the application startup path (first class).</param>
        /// <param name="folder2">Path suffix appended to the application startup path (second class).</param>
        /// <param name="folder3">Path suffix appended to the application startup path (test documents).</param>
        public void Processing(string folder1, string folder2, string folder3)
        {
            NLP                      nlp = new NLP();
            Dictionary <string, int> allCounts      = new Dictionary <string, int>();
            Dictionary <string, int> featureIds     = new Dictionary <string, int>();
            Dictionary <string, int> firstCounts    = new Dictionary <string, int>();
            Dictionary <string, int> secondCounts   = new Dictionary <string, int>();
            Dictionary <string, int> documentCounts = new Dictionary <string, int>();
            int                      nextId = 1;
            int                      firstTotal = 0, secondTotal = 0;

            // Both writers are disposed (and flushed) even if reading or tokenizing throws.
            using (StreamWriter corpusWriter = new StreamWriter("./svm_B.txt"))
            using (StreamWriter testWriter = new StreamWriter("./svm_B_test.txt"))
            {
                string   startupPath = System.Windows.Forms.Application.StartupPath;
                string[] folders     = { startupPath + folder1, startupPath + folder2, startupPath + folder3 };

                // Pass 1: per-class counts, overall counts and feature ids.
                for (int i = 0; i < folders.Length; i++)
                {
                    foreach (string fname in System.IO.Directory.GetFiles(folders[i]))
                    {
                        foreach (string feature in ReadBigramFeatures(nlp, fname))
                        {
                            if (i == 0)
                            {
                                firstTotal++;
                                IncrementBigramCount(firstCounts, feature);
                            }
                            else if (i == 1)
                            {
                                secondTotal++;
                                IncrementBigramCount(secondCounts, feature);
                            }
                            IncrementBigramCount(allCounts, feature);
                            if (!featureIds.ContainsKey(feature))
                            {
                                featureIds.Add(feature, nextId++);
                            }
                        }
                    }
                    foreach (KeyValuePair <string, int> item in allCounts)
                    {
                        Console.WriteLine(item.Key + " " + item.Value);
                    }
                }

                // Pass 2: one output line per document.
                for (int i = 0; i < folders.Length; i++)
                {
                    StreamWriter writer = i < 2 ? corpusWriter : testWriter;
                    string       label  = i == 0 ? "-1 " : "+1 ";

                    foreach (string fname in System.IO.Directory.GetFiles(folders[i]))
                    {
                        documentCounts.Clear();
                        int documentTotal = 0;
                        foreach (string feature in ReadBigramFeatures(nlp, fname))
                        {
                            documentTotal++;
                            IncrementBigramCount(documentCounts, feature);
                        }

                        writer.Write(label);
                        foreach (KeyValuePair <string, int> entry in documentCounts)
                        {
                            int  inFirst, inSecond;
                            bool hasFirst  = firstCounts.TryGetValue(entry.Key, out inFirst);
                            bool hasSecond = secondCounts.TryGetValue(entry.Key, out inSecond);

                            // A document of the first folder is weighted against the second
                            // folder's counts and vice versa; test documents use both combined.
                            int referenceCount, referenceTotal;
                            if (i == 0)
                            {
                                if (!hasSecond)
                                {
                                    continue;
                                }
                                referenceCount = inSecond;
                                referenceTotal = secondTotal;
                            }
                            else if (i == 1)
                            {
                                if (!hasFirst)
                                {
                                    continue;
                                }
                                referenceCount = inFirst;
                                referenceTotal = firstTotal;
                            }
                            else
                            {
                                if (!hasFirst || !hasSecond)
                                {
                                    continue;
                                }
                                referenceCount = inFirst + inSecond;
                                referenceTotal = firstTotal + secondTotal;
                            }

                            // log( relative frequency in the document / relative frequency in the reference counts )
                            double weight = Math.Log(((double)entry.Value / (double)documentTotal)
                                                     / ((double)referenceCount / (double)referenceTotal));
                            if (weight < 0)
                            {
                                continue;
                            }

                            // Invariant culture keeps '.' as the decimal separator on every machine.
                            writer.Write(featureIds[entry.Key].ToString(System.Globalization.CultureInfo.InvariantCulture)
                                         + ":" + weight.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                        }
                        writer.WriteLine("");
                    }
                }
            }
        }

        /// <summary>
        /// Lazily yields the bigram features of a text file, in reading order.
        /// </summary>
        /// <remarks>
        /// Each line is trimmed and split with <c>SentDetect</c>; each sentence goes through
        /// Tokenize, FilterOutStopWords, Stem, Lemmatization and Bigrams, and every bigram is
        /// emitted as the concatenation of its two items. The reader is closed when the
        /// enumeration finishes or is abandoned.
        /// </remarks>
        private static IEnumerable <string> ReadBigramFeatures(NLP nlp, string path)
        {
            using (System.IO.StreamReader reader = new System.IO.StreamReader(path))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    foreach (string sentence in nlp.SentDetect(line.Trim()))
                    {
                        Tuple <string, string>[] bigrams =
                            nlp.Bigrams(nlp.Lemmatization(nlp.Stem(nlp.FilterOutStopWords(nlp.Tokenize(sentence)))));
                        foreach (Tuple <string, string> pair in bigrams)
                        {
                            yield return pair.Item1 + pair.Item2;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Adds one to the count stored for <paramref name="key"/>, starting at zero for unseen keys.
        /// </summary>
        private static void IncrementBigramCount(Dictionary <string, int> counts, string key)
        {
            int current;
            counts.TryGetValue(key, out current); // leaves 0 in current when the key is missing
            counts[key] = current + 1;
        }
        /// <summary>
        /// Builds the negation-aware test feature file <c>./svm-negation_test.txt</c>.
        /// </summary>
        /// <remarks>
        /// Pass 1 reads every file of all three folders. A token for which
        /// <c>nlp.IsNegation</c> is true is merged with the preceding token of the sentence into
        /// a "negation gram", and the following token(s) are appended to that gram; all other
        /// tokens are used as they are. Features of <paramref name="folder1"/> and
        /// <paramref name="folder2"/> are counted separately and every feature gets a numeric id.
        /// Pass 2 writes one <c>+1 id:value ...</c> line per document of
        /// <paramref name="folder3"/>.
        /// </remarks>
        /// <param name="folder1">Path suffix appended to the application startup path (first class).</param>
        /// <param name="folder2">Path suffix appended to the application startup path (second class).</param>
        /// <param name="folder3">Path suffix appended to the application startup path (test documents).</param>
        public void Processing(string folder1, string folder2, string folder3)
        {
            NLP                      nlp = new NLP();
            Dictionary <string, int> featureIds     = new Dictionary <string, int>();
            Dictionary <string, int> firstCounts    = new Dictionary <string, int>();
            Dictionary <string, int> secondCounts   = new Dictionary <string, int>();
            Dictionary <string, int> documentCounts = new Dictionary <string, int>();
            int                      nextId = 1;
            int                      firstTotal = 0, secondTotal = 0;

            // The writer and every reader are disposed even if reading or tokenizing throws.
            using (StreamWriter testWriter = new StreamWriter("./svm-negation_test.txt"))
            {
                string   startupPath = System.Windows.Forms.Application.StartupPath;
                string[] folders     = { startupPath + folder1, startupPath + folder2, startupPath + folder3 };

                // Pass 1: per-class counts and feature ids (the test folder only contributes ids).
                for (int i = 0; i < folders.Length; i++)
                {
                    foreach (string fname in System.IO.Directory.GetFiles(folders[i]))
                    {
                        using (System.IO.StreamReader reader = new System.IO.StreamReader(fname))
                        {
                            string line;
                            string negationGram = "";               // carried across the lines of one file
                            while ((line = reader.ReadLine()) != null)
                            {
                                bool pendingNegation = false;       // reset for every line
                                foreach (string sentence in nlp.SentDetect(line.Trim()))
                                {
                                    string previousToken = "";      // reset for every sentence
                                    foreach (string token in NormalizeNegationTokens(nlp, sentence))
                                    {
                                        bool   negationToken = false;
                                        string feature;
                                        if (nlp.IsNegation(token))
                                        {
                                            negationToken   = true;
                                            pendingNegation = true;
                                            negationGram    = previousToken + token;
                                            feature         = negationGram;
                                        }
                                        else
                                        {
                                            previousToken = token;
                                            if (pendingNegation)
                                            {
                                                negationGram = negationGram + token;
                                                feature      = negationGram;
                                            }
                                            else
                                            {
                                                feature = token;
                                            }
                                        }

                                        if (i == 0)
                                        {
                                            firstTotal++;
                                            IncrementNegationCount(firstCounts, feature);
                                        }
                                        else if (i == 1)
                                        {
                                            secondTotal++;
                                            IncrementNegationCount(secondCounts, feature);
                                        }

                                        // NOTE(review): the negation state is only advanced when the
                                        // feature is new to the vocabulary; for an already-known feature
                                        // the gram keeps growing with the next token. This mirrors the
                                        // original control flow - confirm whether it is intended.
                                        if (featureIds.ContainsKey(feature))
                                        {
                                            continue;
                                        }
                                        featureIds.Add(feature, nextId++);
                                        if (negationToken)
                                        {
                                            negationGram = token;
                                        }
                                        else
                                        {
                                            pendingNegation = false;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                // Pass 2: one output line per test document.
                // NOTE(review): test documents are counted by plain tokens, not by negation
                // grams, so only features that are plain tokens can match - confirm intent.
                foreach (string fname in System.IO.Directory.GetFiles(folders[2]))
                {
                    int documentTotal = 0;
                    documentCounts.Clear();
                    using (System.IO.StreamReader reader = new System.IO.StreamReader(fname))
                    {
                        string line;
                        testWriter.Write("+1 ");
                        while ((line = reader.ReadLine()) != null)
                        {
                            foreach (string sentence in nlp.SentDetect(line.Trim()))
                            {
                                foreach (string token in NormalizeNegationTokens(nlp, sentence))
                                {
                                    documentTotal++;
                                    IncrementNegationCount(documentCounts, token);
                                }
                            }
                        }
                    }

                    foreach (KeyValuePair <string, int> entry in documentCounts)
                    {
                        // Only features counted in both class folders are emitted.
                        int inFirst;
                        if (!firstCounts.TryGetValue(entry.Key, out inFirst))
                        {
                            continue;
                        }
                        int inSecond;
                        if (!secondCounts.TryGetValue(entry.Key, out inSecond))
                        {
                            continue;
                        }

                        // log( relative frequency in the document / relative frequency in both classes )
                        double weight = Math.Log(((double)entry.Value / (double)documentTotal)
                                                 / ((double)(inFirst + inSecond) / (double)(firstTotal + secondTotal)));
                        if (weight < 0)
                        {
                            continue;
                        }

                        // Invariant culture keeps '.' as the decimal separator on every machine.
                        testWriter.Write(featureIds[entry.Key].ToString(System.Globalization.CultureInfo.InvariantCulture)
                                         + ":" + weight.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ");
                    }
                    testWriter.WriteLine("");
                }
            }
        }

        /// <summary>
        /// Runs the NLP calls Tokenize, FilterOutStopWords, Stem and Lemmatization (in that order) on a sentence.
        /// </summary>
        private static string[] NormalizeNegationTokens(NLP nlp, string sentence)
        {
            return nlp.Lemmatization(nlp.Stem(nlp.FilterOutStopWords(nlp.Tokenize(sentence))));
        }

        /// <summary>
        /// Adds one to the count stored for <paramref name="key"/>, starting at zero for unseen keys.
        /// </summary>
        private static void IncrementNegationCount(Dictionary <string, int> counts, string key)
        {
            int current;
            counts.TryGetValue(key, out current); // leaves 0 in current when the key is missing
            counts[key] = current + 1;
        }