/* Splits raw English text into a flat list of cleaned tokens:
 * sentence-detect -> tokenize -> lemmatize -> drop stopwords ->
 * break each surviving token on punctuation, discarding empty fragments.
 * (Method name "slipt" is kept as-is: callers may reference it.) */
static List<string> slipt(string n)
{
    NLP nlp = new NLP();
    // punctuation characters stripped out of each surviving token
    char[] delimate = new char[] { '\'', ':', ',', '.', '(', ')', '/', '\"', '!', '*', ';', '[', ']', '{', '}' };
    List<string> temp = new List<string>();

    // sentence detection
    string[] sentences = nlp.SentDetect(n);
    foreach (string s in sentences)
    {
        // tokenization
        string[] tokens2 = nlp.Tokenize(s);

        // stemming / lemmatization (in place)
        for (int i = 0; i < tokens2.Length; i++)
        {
            tokens2[i] = nlp.Lemmatization(tokens2[i]);
        }

        // filter out stopwords, then split the remainder on punctuation.
        // RemoveEmptyEntries replaces the original manual `t != ""` check.
        string[] result2 = nlp.FilterOutStopWords(tokens2);
        foreach (string sf in result2)
        {
            temp.AddRange(sf.Split(delimate, StringSplitOptions.RemoveEmptyEntries));
        }
    }
    return temp;
    /*Parser
     * Parse p = nlp.Parser(sent);
     * p.show();*/
    //Console.ReadKey();
}
/* Reads hotel reviews (one per line) tagged with <opinion>...</opinion>
 * spans, runs Chinese word segmentation, and writes one training row per
 * character in CRF-style columns:
 *   [char]\t[POS]\t[word length]\t[B/I]\t[B-OPINION / I-OPINION / 0]
 * BUG FIX: the original never disposed sw/sr, so the StreamWriter's
 * buffer could be lost and the output file truncated; `using` guarantees
 * flush + close even on exceptions. */
static void Main(string[] args)
{
    string r_path = "./input_hotel_review.txt";
    string w_path = "./output.txt";
    NLP nlp = new NLP();

    using (StreamWriter sw = new StreamWriter(w_path))
    using (StreamReader sr = new StreamReader(r_path))
    {
        while (!sr.EndOfStream)
        {
            int index = 0, count = 0;
            List<indexcoll> option_index = new List<indexcoll>();
            string str = sr.ReadLine();

            // Collect every opinion span. start/end are character offsets
            // into the tag-stripped text: each tag pair is 9 + 10 = 19
            // characters, hence the `19 * count` shift per earlier span.
            do
            {
                index = str.IndexOf("<opinion>", index);
                if (index != -1)
                {
                    int end_index = str.IndexOf("</opinion>", index);
                    // NOTE(review): assumes a matching </opinion> always exists;
                    // end_index == -1 would yield a negative span length — confirm input is well-formed.
                    indexcoll temp = new indexcoll();
                    temp.start = index - (19 * count);
                    index += 9; // skip past "<opinion>"
                    temp.end = end_index - index;
                    option_index.Add(temp);
                    count++;
                }
            } while (index != -1);

            Console.WriteLine(str);
            index = count = 0;

            // Chinese word segmentation
            string[] result = nlp.CWS(str);
            foreach (string s in result)
            {
                string[] tokens = s.Split(' ');
                foreach (string t in tokens)
                {
                    char[] separators = { '(', ')' };
                    string[] temp = t.Split(separators);
                    // NOTE(review): temp[1] assumes each token looks like "word(POS)" —
                    // verify CWS output format, otherwise this throws IndexOutOfRange.
                    for (int i = 0, flag = 0; i < temp[0].Length; i++, count++)
                    {
                        // [char][\t][POS][\t][word length][\t][B,I][\t][Ans]
                        sw.Write(temp[0][i] + "\t" + temp[1] + "\t" + temp[0].Length + "\t");
                        if (i == 0)
                        {
                            sw.Write("B\t"); // first char of a segmented word
                            if (index < option_index.Count() && option_index[index].start.Equals(count))
                            {
                                sw.Write("B-OPINION\t");
                                flag = 1;
                            }
                            else
                            {
                                sw.Write("0\t");
                                // previous opinion span ended: advance to next span
                                if (flag == 1)
                                {
                                    flag = 0;
                                    index++;
                                }
                            }
                        }
                        else
                        {
                            sw.Write("I\t"); // continuation char of a word
                            if (flag == 1 && (option_index[index].start + option_index[index].end) > count)
                            {
                                sw.Write("I-OPINION\t");
                            }
                            else
                            {
                                sw.Write("0\t");
                            }
                        }
                        sw.WriteLine();
                    }
                }
            }
            sw.WriteLine(); // blank line between reviews
        }
    }
}
/* Builds a term-frequency feature file (SVM-light style "+label id:tf ...")
 * from every review file in folderName, keyed by the song-name/answer
 * dictionaries loaded at the top.
 * BUG FIX: the original called `sent.Replace(c, ' ')` and discarded the
 * result — strings are immutable, so NO punctuation was ever removed and
 * the `sent == " "` skip never fired. The cleaned text is now assigned
 * to a local. Also fixed: the per-file StreamReader was never closed
 * (handle leak across the whole folder); all readers/writers now use
 * `using` so they are released even on exceptions. */
public void Nlp(string folderName)
{
    NLP nlp = new NLP();

    // Sentiment lexicons (currently only loaded; the scoring code that
    // consumed pw/nw is commented out below — kept for parity).
    string[] pw = new string[2048];
    string[] nw = new string[5000];
    Dictionary<string, string> answer = new Dictionary<string, string>();
    int i = 0;
    string line, line1;

    using (StreamReader srP = new StreamReader(@"C:\Users\Ian Hsieh\Downloads\Project Testsite\SenticDic\positive-words.txt"))
    {
        while ((line = srP.ReadLine()) != null)
        {
            pw[i++] = nlp.Lemmatization(nlp.Stem(line));
        }
    }
    i = 0;
    using (StreamReader srN = new StreamReader(@"C:\Users\Ian Hsieh\Downloads\Project Testsite\SenticDic\negative-words.txt"))
    {
        while ((line = srN.ReadLine()) != null)
        {
            nw[i++] = nlp.Lemmatization(nlp.Stem(line));
        }
    }

    // song name -> gold answer label, with '?' and ':' normalized to spaces
    // (file names cannot contain them, so the lookup key must match)
    using (StreamReader srGA = new StreamReader(@"C:\Users\Ian Hsieh\Downloads\Project Testsite\Gold_answer_test.txt"))
    using (StreamReader srSN = new StreamReader(@"C:\Users\Ian Hsieh\Downloads\Project Testsite\Songnames_test.txt"))
    {
        while ((line = srGA.ReadLine()) != null && (line1 = srSN.ReadLine()) != null)
        {
            answer.Add(line1.Replace('?', ' ').Replace(':', ' '), line);
        }
    }

    int number = 1, tv, no;
    Dictionary<string, int> dict = new Dictionary<string, int>(); // term -> frequency (per file)
    Dictionary<string, int> dic = new Dictionary<string, int>();  // term -> global feature id
    List<string> avg = new List<string>();

    using (StreamWriter sw = new StreamWriter(@"C:\Users\Ian Hsieh\Downloads\Project Testsite\MIR_TF_test.txt"))
    {
        foreach (string fileName in System.IO.Directory.GetFiles(folderName))
        {
            using (System.IO.StreamReader file = new System.IO.StreamReader(fileName))
            {
                while ((line = file.ReadLine()) != null)
                {
                    string[] sents = nlp.SentDetect(line.Trim());
                    string[] tokens, tokens_reviews;
                    foreach (string sent in sents)
                    {
                        // Strip punctuation. Replace returns a NEW string;
                        // it must be captured (the original discarded it).
                        string cleaned = sent
                            .Replace('(', ' ').Replace(')', ' ').Replace('!', ' ')
                            .Replace('#', ' ').Replace('&', ' ').Replace('*', ' ')
                            .Replace(',', ' ').Replace('.', ' ').Replace(':', ' ')
                            .Replace(';', ' ').Replace('?', ' ').Replace('"', ' ')
                            .Replace('\\', ' ').Replace('/', ' ').Replace('-', ' ');
                        // skip sentences that were nothing but punctuation/whitespace
                        if (string.IsNullOrWhiteSpace(cleaned)) { continue; }

                        tokens = nlp.Tokenize(cleaned); // tokenize sentences
                        tokens_reviews = nlp.Lemmatization(nlp.Stem(nlp.FilterOutStopWords(tokens)));
                        foreach (string token in tokens_reviews)
                        {
                            // per-file term frequency
                            if (dict.ContainsKey(token)) { dict[token]++; }
                            else { dict.Add(token, 1); }
                            // global feature id, assigned on first sighting
                            if (!dic.ContainsKey(token)) { dic.Add(token, number++); }
                        }
                    }
                }
            }

            // Label comes from the answer dictionary, keyed by the part of the
            // file name between the first '_' and ".txt".
            sw.Write("+" + answer[fileName.Substring(fileName.IndexOf('_') + 1, (fileName.IndexOf(".txt") - fileName.IndexOf('_') - 1))] + " ");
            tv = 0;
            no = 0;
            foreach (KeyValuePair<string, int> item in dict)
            {
                int value = 1; // sentiment weighting disabled; raw TF only
                /*foreach (string word in pw)
                 * {
                 *     if (word == item.Key) { value = 1; break; }
                 * }
                 * foreach (string word in nw)
                 * {
                 *     if (word == item.Key) { value = -1; break; }
                 * }
                 * if (value == 0 && !avg.Contains(item.Key))
                 * {
                 *     avg.Add(item.Key);
                 *     continue;
                 * }*/
                Console.WriteLine(dic[item.Key] + ":" + (item.Value * value) + "\n");
                sw.Write(dic[item.Key] + ":" + (item.Value * value) + " ");
                tv += item.Value * value;
                no += item.Value;
            }
            /*foreach (string str in avg)
             * {
             *     Console.WriteLine(dic[str] + ":" + ((double)dict[str] * (double)tv / (double)no) + "\n");
             *     sw.Write(dic[str] + ":" + ((double)dict[str] * (double)tv / (double)no) + " ");
             * }*/
            sw.WriteLine();
            dict.Clear(); // per-file state; dic (feature ids) is deliberately global
            avg.Clear();
        }
    }
}
/* Converts training.xml (RDF/Text nodes) into an SVM-light style file:
 * "+<class> <featureId>:1 ..." — one line per document. Category names
 * map to numeric class labels; lowercased, lemmatized, non-stopword
 * tokens become binary features numbered in order of first appearance.
 * Fixes vs. original: the StreamWriter is now disposed via `using`
 * (it was never closed; the code compensated by calling Flush() after
 * every single write), the unused `separator` local is removed, and the
 * double dictionary lookup (ContainsKey + indexer) is a single
 * TryGetValue. */
static void Main(string[] args)
{
    NLP nlp = new NLP();
    XmlDocument doc = new XmlDocument();
    doc.Load("training.xml");
    XmlNodeList nodes = doc.SelectNodes("RDF/Text");

    Dictionary<string, int> dict = new Dictionary<string, int>(); // lemma -> feature id
    int tokno = 1; // next feature id to assign

    using (StreamWriter sw = new StreamWriter("./Task1op.txt"))
    {
        foreach (XmlNode n in nodes)
        {
            string text = n.InnerText;
            string[] sents = nlp.SentDetect(text);

            // category name -> numeric class label (unknown category -> 0)
            int polar;
            switch (n.Attributes["category"].Value)
            {
                case "book":      polar = 1; break;
                case "dvd":       polar = 2; break;
                case "health":    polar = 3; break;
                case "music":     polar = 4; break;
                case "toys_games": polar = 5; break;
                default:          polar = 0; break;
            }
            sw.Write("+" + polar.ToString() + " ");

            foreach (string sent in sents)
            {
                string s = sent.ToLower();
                string[] token = nlp.Tokenize(s);
                foreach (string t in token)
                {
                    string lemma = nlp.Lemmatization(t);
                    if (nlp.IsStopWord(lemma)) { continue; }

                    // single lookup: fetch the id, assigning a fresh one on first sighting
                    int id;
                    if (!dict.TryGetValue(lemma, out id))
                    {
                        id = tokno++;
                        dict.Add(lemma, id);
                    }
                    sw.Write(id + ":1 ");
                }
            }
            sw.WriteLine("");
        }
    } // using disposes + flushes the writer here

    Console.WriteLine("Finished.");
    Console.ReadKey();
}