/// <summary>
/// Demonstrates text normalization on a fixed Korean sample sentence.
/// </summary>
/// <returns>The value returned by the normalizer for the sample sentence.</returns>
public string NormalizeSample()
{
    const string sample = "정규화(Normalize) 예제입니당~";

    // Expected output: "정규화(Normalize) 예제입니다~"
    return TwitterKoreanProcessorCS.Normalize(sample);
}
/// <summary>
/// Demonstrates phrase extraction: tokenizes a fixed Korean sample sentence,
/// extracts its phrases and renders each phrase together with its tokens.
/// </summary>
/// <returns>
/// For every phrase, a "---------" separator line followed by a line of the form
/// "PhrasePos | text(Pos) [offset,length] / ...".
/// </returns>
public string ExtractPhraseSample1()
{
    const string sample = "토큰화 처리 이후 어구를 추출하는 예제입니당ㅇㅇㅇ";
    var output = new StringBuilder();

    var phrases = TwitterKoreanProcessorCS.ExtractPhrases(TwitterKoreanProcessorCS.Tokenize(sample));
    foreach (var phrase in phrases)
    {
        output.AppendLine("---------");
        output.AppendFormat("{0} | ", phrase.Pos.ToString());

        foreach (var token in phrase.Tokens)
        {
            output.AppendFormat("{0}({1}) [{2},{3}] / ", token.Text, token.Pos.ToString(), token.Offset, token.Length);
        }

        output.AppendLine();
    }

    // Example output lines:
    //   Noun | 토큰(ProperNoun) [0,2] /
    //   Noun | 처리(Noun) [4,2] /
    //   ...
    //   Noun | 어구(Noun) [10,2] /
    return output.ToString();
}
/// <summary>
/// Counts proper-noun occurrences in <c>Talks[starts .. ends)</c> and merges the
/// counts into the shared <c>Words</c> table.
/// </summary>
/// <param name="starts">Index of the first talk to process (inclusive).</param>
/// <param name="ends">Index one past the last talk to process (exclusive).</param>
/// <remarks>
/// Counts are gathered in a local dictionary and merged into <c>Words</c> under a lock
/// at the end. Tokenizer failures for a single talk are logged and that talk is skipped.
/// </remarks>
private void samafc_internal(int starts, int ends)
{
    // Report progress roughly once per 1% of the range. The step is clamped to 1 so that
    // ranges shorter than 100 items do not cause a division by zero in the modulo below.
    var step = System.Math.Max(1, (ends - starts) / 100);
    var localCounts = new Dictionary<string, int>();

    for (int i = starts; i < ends; i++)
    {
        try
        {
            if (Talks[i].State == TalkState.Message || Talks[i].State == TalkState.Append)
            {
                var tokens = TwitterKoreanProcessorCS.Tokenize(Talks[i].Content);
                var stem = TwitterKoreanProcessorCS.Stem(tokens);

                foreach (var word in stem)
                {
                    // Only proper nouns are counted (plain nouns are deliberately excluded here).
                    if (word.Pos != KoreanPos.ProperNoun)
                    {
                        continue;
                    }

                    int seen;
                    localCounts.TryGetValue(word.Text, out seen);
                    localCounts[word.Text] = seen + 1;
                }
            }
        }
        catch (Exception e)
        {
            Console.Console.Instance.WriteLine(Talks[i].Content + "\r\n" + e.Message);
        }

        var processed = System.Threading.Interlocked.Increment(ref cnt);

        // Also report on the last item of the range (the loop never reaches i == ends).
        if ((i % step == 0) || i == ends - 1)
        {
            lock (cpb)
            {
                cpb.SetProgress(processed / (float)Talks.Count * 100);
            }
        }
    }

    lock (Words)
    {
        foreach (var entry in localCounts)
        {
            if (!Words.ContainsKey(entry.Key))
            {
                // Carry over the full local count, not just 1, for words seen for the first time.
                Words.Add(entry.Key, entry.Value);
            }
            else
            {
                Words[entry.Key] += entry.Value;
            }
        }
    }
}
/// <summary>
/// Demonstrates converting tokens back to plain strings for a fixed Korean sample sentence.
/// </summary>
/// <returns>The token strings joined with " / ".</returns>
public string TokensToStringsSample1()
{
    const string sample = "토큰화를 처리하는 예제입니다. 문자열화는 덤";

    var tokenStrings = TwitterKoreanProcessorCS.TokensToStrings(TwitterKoreanProcessorCS.Tokenize(sample));

    // Expected output: 토큰 / 화 / 를 / 처리 / 하는 / 예제 / 입니 / 다 / . / 문자열 / 화 / 는 / 덤
    return string.Join(" / ", tokenStrings);
}
// Tokenization
/// <summary>
/// Tokenizes the <c>text</c> member and returns the token strings joined with "/".
/// </summary>
/// <returns>The token strings separated by "/".</returns>
public string TokenizeSample()
{
    // The StringBuilder the original allocated here was never used and has been removed.
    var tokens = TwitterKoreanProcessorCS.Tokenize(text);
    var results = TwitterKoreanProcessorCS.TokensToStrings(tokens);
    return string.Join("/", results);
}
/// <summary>
/// Runs sentiment analysis on a Korean input string.
/// The results accumulate until the next GetSentimentAndFlush() call.
/// </summary>
/// <param name="input">Korean text</param>
public override void Analyze(string input)
{
    if (!HasStart)
    {
        return;
    }

    // Matches are not applied directly; each one is queued on the "aggregateKoreanSentiment" task queue.
    void UpdateAggregate(object[] args)
    {
        KoreanWordSentiment sentiment = args[0] as KoreanWordSentiment;
        int weight = 1;
        aggregatePolarity[(int)sentiment.Polarity] += weight;
        aggregateIntensity[(int)sentiment.Intensity] += weight;
        aggregateSubjectivityType[(int)sentiment.SubjectivityType] += weight;
        aggregateSubjectivityPolarity[(int)sentiment.SubjectivityPolarity] += weight;
    }

    string normalized = TwitterKoreanProcessorCS.Normalize(Hangul.Assemble(input, true));
    var stemmed = TwitterKoreanProcessorCS.Stem(TwitterKoreanProcessorCS.Tokenize(normalized));

    // Drops only the LEADING tokens up to the first adjective/adverb/exclamation/noun/noun-prefix/verb/verb-prefix.
    // NOTE(review): SkipWhile does not filter later tokens; confirm whether a full filter (Where) was intended.
    var trimmed = stemmed.SkipWhile((e) =>
        e.Pos != KoreanPos.Adjective &&
        e.Pos != KoreanPos.Adverb &&
        e.Pos != KoreanPos.Exclamation &&
        e.Pos != KoreanPos.Noun &&
        e.Pos != KoreanPos.NounPrefix &&
        e.Pos != KoreanPos.Verb &&
        e.Pos != KoreanPos.VerbPrefix);

    List<string> tokens = TwitterKoreanProcessorCS.TokensToStrings(trimmed).ToList();
    int count = tokens.Count;

    // Look up every n-gram (n = 1 .. NUM_OF_GRAMS); the words of an n-gram are joined with ';'.
    for (int gramSize = 1; gramSize <= NUM_OF_GRAMS; gramSize++)
    {
        var dictionary = koreanSentimentDictionary[gramSize - 1];
        int lastStart = count - gramSize;

        for (int start = 0; start <= lastStart; start++)
        {
            string word = String.Join(";", tokens.GetRange(start, gramSize));
            Console.WriteLine(word);

            KoreanWordSentiment match;
            if (dictionary.TryGetValue(word, out match))
            {
                Util.TaskQueue.Add("aggregateKoreanSentiment", UpdateAggregate, match);
            }
        }
    }
}
/// <summary>
/// Demonstrates tokenization of a fixed Korean sample sentence.
/// </summary>
/// <returns>Every token rendered as "text(Pos) [offset,length] / ", concatenated.</returns>
public string TokenizeSample1()
{
    const string sample = "토큰화를 처리하는 예제입니다";
    var output = new StringBuilder();

    foreach (var token in TwitterKoreanProcessorCS.Tokenize(sample))
    {
        output.AppendFormat("{0}({1}) [{2},{3}] / ", token.Text, token.Pos.ToString(), token.Offset, token.Length);
    }

    // Expected output:
    //   토큰(ProperNoun) [0,2] / 화(Suffix) [2,1] / 를(Josa) [3,1] / ... / 입니(Adjective) [12,2] / 다(Eomi) [14,1] /
    return output.ToString();
}
/// <summary>
/// Extracts the tokens tagged "Noun" from the input text.
/// </summary>
/// <param name="input">Text to tokenize.</param>
/// <returns>The text of each "Noun" token, each followed by a single space.</returns>
public String Tokenize(String input)
{
    StringBuilder nouns = new StringBuilder();

    foreach (var token in TwitterKoreanProcessorCS.Tokenize(input))
    {
        // Only tokens whose tag is exactly "Noun" are kept.
        if (!token.Pos.ToString().Equals("Noun"))
        {
            continue;
        }

        nouns.Append(token.Text + " ");
    }

    return nouns.ToString();
}
// Stemming
/// <summary>
/// Tokenizes and stems the <c>text</c> member and renders every stemmed token.
/// </summary>
/// <returns>Every stemmed token rendered as "text(Pos) [offset,length] / ", concatenated.</returns>
public string StemSample()
{
    var output = new StringBuilder();

    var stemmed = TwitterKoreanProcessorCS.Stem(TwitterKoreanProcessorCS.Tokenize(text));
    foreach (var token in stemmed)
    {
        output.AppendFormat("{0}({1}) [{2},{3}] / ", token.Text, token.Pos.ToString(), token.Offset, token.Length);
    }

    return output.ToString();
}
/// <summary>
/// Demonstrates stemming after tokenization on a fixed Korean sample sentence.
/// </summary>
/// <returns>Every stemmed token rendered as "text(Pos) [offset,length] / ", concatenated.</returns>
public string StemSample1()
{
    const string sample = "토큰화 이후 어근화를 처리하는 예제입니다";
    var output = new StringBuilder();

    var stemmed = TwitterKoreanProcessorCS.Stem(TwitterKoreanProcessorCS.Tokenize(sample));
    foreach (var token in stemmed)
    {
        output.AppendFormat("{0}({1}) [{2},{3}] / ", token.Text, token.Pos.ToString(), token.Offset, token.Length);
    }

    // Expected output:
    //   토큰(ProperNoun) [0,2] / 화(Suffix) [2,1] / (Space) [3,1] / 이후(Noun) [4,2] / ... / 예제(Noun) [17,2] / 이다(Adjective) [19,3] /
    return output.ToString();
}
/// <summary>
/// Tokenizes the given text and joins the token strings with " / ",
/// unless the text is the exit keyword "종료", in which case a fixed farewell message is returned.
/// </summary>
/// <param name="saysth">Text to tokenize, or the exit keyword.</param>
/// <returns>The joined token strings, or the farewell message.</returns>
public static string TokenTosTringSample(string saysth)
{
    // Exit keyword: answer with the farewell message instead of tokenizing.
    if (saysth == "종료")
    {
        return "안녕히 가세요";
    }

    var tokenStrings = TwitterKoreanProcessorCS.TokensToStrings(TwitterKoreanProcessorCS.Tokenize(saysth));
    return string.Join(" / ", tokenStrings);
}
/// <summary>
/// Loads the sentiment dictionaries and initializes the sentiment analyzer.
/// </summary>
public override void Initialize()
{
    // Does nothing when an instance has already been registered.
    if (!(instance is null))
    {
        return;
    }
    instance = this;

    #region Load TwitterKoreanProcessor (morpheme analysis engine)
    // Runs the whole normalize/tokenize/stem/to-strings pipeline once on a background task.
    // The result is discarded; presumably this only warms up the engine.
    System.Threading.Tasks.Task.Run(() =>
    {
        var normalized = TwitterKoreanProcessorCS.Normalize("초기화");
        var stemmed = TwitterKoreanProcessorCS.Stem(TwitterKoreanProcessorCS.Tokenize(normalized));
        var strings = TwitterKoreanProcessorCS.TokensToStrings(stemmed);
    });
    #endregion

    #region Load Korean sentiment dictionaries
    /*
     * HangulSentiment1.csv is created by MorphemeParser(https://github.com/salt26/morpheme-parser).
     * The original data source is Korean Sentiment Lexicon(http://word.snu.ac.kr/kosac/lexicon.php).
     */
    hangulSentimentCSV = new List<Util.CSVReader>();
    koreanSentimentDictionary = new List<Dictionary<string, KoreanWordSentiment>>();

    // One CSV file and one dictionary per n-gram size: HangulSentiment1.csv .. HangulSentiment<NUM_OF_GRAMS>.csv
    for (int gram = 1; gram <= NUM_OF_GRAMS; gram++)
    {
        var reader = new Util.CSVReader("HangulSentiment" + gram + ".csv", true);
        hangulSentimentCSV.Add(reader);

        var dictionary = new Dictionary<string, KoreanWordSentiment>();
        koreanSentimentDictionary.Add(dictionary);

        // Column 0 is the dictionary key; columns 0-4 are passed to the KoreanWordSentiment constructor.
        foreach (List<string> row in reader.GetData())
        {
            dictionary.Add(row[0], new KoreanWordSentiment(row[0], row[1], row[2], row[3], row[4]));
        }
    }

    Util.TaskQueue.Add("aggregateKoreanSentiment", InitializeAggregate);
    #endregion

    IsReady = true;
}
/// <summary>
/// Tokenizes the <c>text</c> member, extracts its phrases and renders each phrase with its tokens.
/// </summary>
/// <returns>
/// For every phrase, a "---------" separator line followed by a line of the form
/// "PhrasePos | text(Pos) [offset,length] / ...".
/// </returns>
public string ExtractPhraseSample1()
{
    var output = new StringBuilder();

    var phrases = TwitterKoreanProcessorCS.ExtractPhrases(TwitterKoreanProcessorCS.Tokenize(text));
    foreach (var phrase in phrases)
    {
        output.AppendLine("---------");
        output.AppendFormat("{0} | ", phrase.Pos.ToString());

        foreach (var token in phrase.Tokens)
        {
            output.AppendFormat("{0}({1}) [{2},{3}] / ", token.Text, token.Pos.ToString(), token.Offset, token.Length);
        }

        output.AppendLine();
    }

    return output.ToString();
}
/// <summary>
/// Reads a chat log file line by line, parses each line into a <c>Line</c>
/// (hour, minute, second, name, normalized chat text) and appends it to <c>chatLineArray</c>.
/// </summary>
/// <param name="filename">Path of the chat log file to read.</param>
/// <remarks>
/// Any exception (missing file, malformed line, ...) stops reading and its message is written
/// to the console; lines parsed before the failure remain in <c>chatLineArray</c>.
/// </remarks>
static void ReadFile(string filename)
{
    try
    {
        // 'using' guarantees the reader is closed even when a line fails to parse.
        using (StreamReader sr = new StreamReader(filename))
        {
            string strLine;
            while ((strLine = sr.ReadLine()) != null)
            {
                // The first 14 characters hold the timestamp: hour at [1..2], minute at [4..5], second at [7..8].
                string time = strLine.Substring(0, 14);

                // Everything from the first space on: " name: chat".
                string noTimeStr = strLine.Substring(strLine.IndexOf(' '));
                int colon = noTimeStr.IndexOf(':');
                string name = noTimeStr.Substring(1, colon - 1);
                string chat = noTimeStr.Substring(colon + 2);

                // Normalize, e.g. 재밌닼ㅋㅋㅋㅋ -> 재밌다ㅋㅋ
                // (reduces runs of 'ㅋ' to two and removes a 'ㅋ' merged in as a final consonant).
                chat = TwitterKoreanProcessorCS.Normalize(chat);

                int hour = int.Parse(time.Substring(1, 2));
                int min = int.Parse(time.Substring(4, 2));
                int sec = int.Parse(time.Substring(7, 2));

                chatLineArray.Add(new Line(hour, min, sec, name, chat));
            }
        }
    }
    catch (Exception ee)
    {
        Console.WriteLine(ee.Message);
    }
}
} // Google parsing

/// <summary>
/// Converts a natural-language search request into a search query string.
/// The input is normalized, tokenized and stemmed; noun-like, alphabetic, punctuation,
/// adverb and number tokens are kept, and two operator words are rewritten:
/// "X 제외" becomes " -X" and "A 혹은 B" becomes "A|B".
/// </summary>
/// <param name="input_Search_Data">The raw search request.</param>
/// <returns>The query terms concatenated without a separator.</returns>
/// <remarks>
/// An operator word without the operand(s) it needs (e.g. at the very start or end of the input)
/// is kept as an ordinary term instead of causing an out-of-range error.
/// </remarks>
public string StemSample1(string input_Search_Data)
{
    var tokens = TwitterKoreanProcessorCS.Tokenize(
        TwitterKoreanProcessorCS.Normalize(input_Search_Data));
    var stemmedTokens = TwitterKoreanProcessorCS.Stem(tokens);

    // Keep only the token kinds that can act as search terms or operator words.
    List<string> list = new List<string>();
    foreach (var stemmedToken in stemmedTokens)
    {
        string pos = stemmedToken.Pos.ToString();
        if (pos.Contains("Noun") || pos == "Alpha" || pos == "Punctuation" || pos == "Adverb" || pos == "Number")
        {
            list.Add(stemmedToken.Text);
        }
    }

    // Operators always act on the most recently emitted term (the tail of 'second'),
    // so the output never has to stay index-aligned with 'list'.
    List<string> second = new List<string>();
    for (int i = 0; i < list.Count; i++)
    {
        string term = list[i];
        bool hasLeft = second.Count > 0;
        bool hasRight = i + 1 < list.Count;

        if (term.Contains("제외") && hasLeft)
        {
            // Exclusion: turn the previous term into " -term".
            second[second.Count - 1] = " -" + second[second.Count - 1];
        }
        else if (term.Contains("혹은") && hasLeft && hasRight)
        {
            // Alternative: merge the previous and the next term into "left|right".
            second[second.Count - 1] = second[second.Count - 1] + "|" + list[i + 1];
            i++; // the right operand has been consumed
        }
        else
        {
            second.Add(term);
        }
    }

    return string.Join("", second);
} // Naver parsing
////////////////////////////////////////////////////////////////
// Morpheme analysis
////////////////////////////////////////////////////////////////
/// <summary>
/// Interprets a chat message and decides how to respond. The checks run in this order:
/// clan function command, video lookup, menu lookup, off-work reply, weather, Athena info,
/// and finally (main room only, with a 1-in-20 chance) a small-talk reply built from the message's words.
/// </summary>
/// <param name="message">The raw chat message.</param>
/// <param name="list">Earlier messages; used by the menu lookup and the "you mentioned this before" replies.</param>
/// <param name="isMainRoom">When false, no small-talk reply is produced.</param>
/// <returns>
/// A tuple of (command or reply text, command contents, flag). In this method the flag is true
/// exactly when the first item is a command ("/영상", "/날씨", or a detected function command)
/// and false for plain replies. An all-empty tuple with flag false means "no response".
/// </returns>
public Tuple<string, string, bool> morphemeProcessor(string message, List<CMessage> list, bool isMainRoom)
{
    Tuple<string, string, bool> emptyTuple = Tuple.Create("", "", false);
    string normalize = TwitterKoreanProcessorCS.Normalize(message);
    var morpheme = TwitterKoreanProcessorCS.Tokenize(normalize);

    // A single timestamp for the whole call keeps all date decisions consistent with each other.
    DateTime now = DateTime.Now;

    //--------------------------------------------------------------------------
    // Clan function detection
    //--------------------------------------------------------------------------
    string[] natural = FunctionCommand(normalize).Split(' ');
    string command = natural[0];
    if (command != "")
    {
        // Everything after the command word, separated by single spaces.
        string contents = string.Join(" ", natural, 1, natural.Length - 1);
        return Tuple.Create(command, contents, true);
    }

    //--------------------------------------------------------------------------
    // Video lookup
    //--------------------------------------------------------------------------
    if (normalize.Contains("영상") || normalize.Contains("방송"))
    {
        bool isVideo = false;
        foreach (var word in enterCommand)
        {
            if (normalize.Contains(word))
            {
                isVideo = true;
                break;
            }
        }

        if (isVideo)
        {
            bool wantsToday = normalize.Contains("오늘");
            bool wantsYesterday = !wantsToday && normalize.Contains("어제");

            if (wantsToday || wantsYesterday)
            {
                // Before 06:00 the current "day" is still the previous calendar day.
                // Real date arithmetic is used so that month and year boundaries roll over correctly
                // (subtracting from the day-of-month number could produce day 0 or -1).
                DateTime day = now.Date;
                if (now.Hour < 6)
                {
                    day = day.AddDays(-1);
                }
                if (wantsYesterday)
                {
                    day = day.AddDays(-1);
                }

                string date = day.Year.ToString("D4") + day.Month.ToString("D2") + day.Day.ToString("D2");
                return Tuple.Create("/영상", date, true);
            }

            // No specific day requested: year and month only.
            return Tuple.Create("/영상", now.Year.ToString("D4") + now.Month.ToString("D2"), true);
        }
    }

    //--------------------------------------------------------------------------
    // Menu lookup
    //--------------------------------------------------------------------------
    Tuple<bool, bool> existMenu = isExistMenu(message, list);
    if (existMenu.Item1 == true)
    {
        return Tuple.Create(getMenu(message, existMenu.Item2), "", false);
    }

    //--------------------------------------------------------------------------
    // Off-work reply
    //--------------------------------------------------------------------------
    if (message.Contains("퇴근"))
    {
        string offWork = offWorkCall(message);
        if (offWork != "")
        {
            return Tuple.Create(offWork, "", false);
        }
    }

    //--------------------------------------------------------------------------
    // Weather detection
    //--------------------------------------------------------------------------
    if (message.Contains("날씨"))
    {
        Tuple<string, string> weatherTuple = weatherCall(message);
        if (weatherTuple.Item1 != "" && weatherTuple.Item2 != "")
        {
            return Tuple.Create("/날씨", weatherTuple.Item2, true);
        }
    }

    //--------------------------------------------------------------------------
    // Athena info
    //--------------------------------------------------------------------------
    if (message.Contains("아테나"))
    {
        string athenaInfo = AthenaInfo(message);
        if (athenaInfo != "")
        {
            return Tuple.Create(athenaInfo, "", false);
        }
    }

    //--------------------------------------------------------------------------
    // Everything else
    //--------------------------------------------------------------------------
    // Outside the main room nothing further is said.
    if (isMainRoom == false)
    {
        return emptyTuple;
    }

    // One generator for the whole call; several generators seeded from nearly identical
    // tick values would produce correlated numbers.
    Random rng = new Random(unchecked((int)DateTime.Now.Ticks));

    // Answer with a probability of 1/20.
    if (rng.Next(20) != 1)
    {
        return emptyTuple;
    }

    foreach (var word in morpheme)
    {
        // Unknown word
        if (word.Unknown == true)
        {
            switch (rng.Next(3))
            {
                case 0:
                    return Tuple.Create(word.Text + "? 그게 무슨 말이에요?", "", false);
                case 1:
                    return Tuple.Create(word.Text + "? 처음 듣는 말이네요.", "", false);
                default:
                    return Tuple.Create(word.Text + "? 무슨 말인지 모르겠어요.", "", false);
            }
        }

        // Exclamation
        if (word.Pos.ToString() == "Exclamation")
        {
            switch (rng.Next(3))
            {
                case 0:
                    return Tuple.Create("와, 정말 놀랍네요.", "", false);
                case 1:
                    return Tuple.Create("저도 놀라워요.", "", false);
                default:
                    return Tuple.Create("대박이네요.", "", false);
            }
        }
    }

    // Pick one noun / proper noun of the message at random as the topic to mention.
    List<string> nouns = new List<string>();
    foreach (var word in morpheme)
    {
        string pos = word.Pos.ToString();
        if (pos == "Noun" || pos == "ProperNoun")
        {
            nouns.Add(word.Text);
        }
    }

    if (nouns.Count == 0)
    {
        return emptyTuple;
    }

    string mention = nouns[rng.Next(nouns.Count)];
    if (mention == "")
    {
        return emptyTuple;
    }

    // Refer back to an earlier message that contains the same word.
    foreach (var queMsg in list)
    {
        // NOTE(review): this compares only the minute-of-hour component, not the full timestamp —
        // confirm whether "strictly older than the current minute" was intended.
        if (queMsg.Time.Minute >= now.Minute)
        {
            continue;
        }

        bool sameDay = (queMsg.Time.Year == now.Year)
                       && (queMsg.Time.Month == now.Month)
                       && (queMsg.Time.Day == now.Day);
        string time = sameDay ? "아까" : "저번에";

        if (queMsg.Message.Contains(mention))
        {
            switch (rng.Next(5))
            {
                case 0:
                    return Tuple.Create(time + " " + mention + "에 대해서 말씀하신 적 있어요. 관심있으신가봐요.", "", false);
                case 1:
                    return Tuple.Create(time + " 말씀하신 " + mention + " 어떤가요?", "", false);
                case 2:
                    return Tuple.Create(time + " " + mention + "에 대해 비슷한 말씀을 하셨었죠.", "", false);
                case 3:
                    return Tuple.Create("자주 언급을 하시니 저도 " + mention + "에 대해서 관심을 가져볼까 해요.", "", false);
                default:
                    return Tuple.Create("아, " + mention + "에 대해서 " + time + " 말씀하셨었어요. 흥미롭네요.", "", false);
            }
        }
    }

    // Plain mention
    switch (rng.Next(5))
    {
        case 0:
            return Tuple.Create(mention + " 좋아하시나봐요.", "", false);
        case 1:
            return Tuple.Create(mention + ", 저도 궁금하네요.", "", false);
        case 2:
            return Tuple.Create(mention + " 어때요?", "", false);
        case 3:
            return Tuple.Create(mention + " 좋나요?", "", false);
        case 4:
            return Tuple.Create(mention + ". 흥미롭네요.", "", false);
        default:
            return emptyTuple;
    }
}
/// <summary>
/// Counts word occurrences per month (key = year * 100 + month) over <c>Talks[starts .. ends)</c>
/// and merges the result into the shared <c>DateWords</c> table.
/// </summary>
/// <param name="starts">Index of the first talk to process (inclusive).</param>
/// <param name="ends">Index one past the last talk to process (exclusive).</param>
/// <param name="filter">Only message talks for which this returns true are counted.</param>
/// <param name="progress">Callback that receives the running value of the shared counter.</param>
/// <param name="using_tokenizer">True to count stemmed tokens; false to count space-separated words.</param>
/// <param name="include_noun">With the tokenizer: also count plain nouns, not only proper nouns.</param>
/// <param name="ignore_symbols">Not read by this method.</param>
/// <param name="ignore_numeric">Not read by this method.</param>
private void samaf_internal_by_date(int starts, int ends, Func<Talk, bool> filter, Action<int> progress, bool using_tokenizer, bool include_noun, bool ignore_symbols, bool ignore_numeric)
{
    var step = (ends - starts) / 100;
    var localByDate = new Dictionary<int, Dictionary<string, int>>();

    for (int i = starts; i < ends; i++)
    {
        bool isMessage = Talks[i].State == TalkState.Message || Talks[i].State == TalkState.Append;
        if (isMessage)
        {
            // A filtered-out message skips the rest of the iteration, including the counter update below.
            if (!filter(Talks[i]))
            {
                continue;
            }

            var date = Talks[i].Time.Year * 100 + Talks[i].Time.Month;

            Dictionary<string, int> bucket;
            if (!localByDate.TryGetValue(date, out bucket))
            {
                bucket = new Dictionary<string, int>();
                localByDate.Add(date, bucket);
            }

            if (using_tokenizer)
            {
                try
                {
                    var stemmed = TwitterKoreanProcessorCS.Stem(TwitterKoreanProcessorCS.Tokenize(Talks[i].Content));
                    foreach (var word in stemmed)
                    {
                        bool counted = word.Pos == KoreanPos.ProperNoun || (include_noun && word.Pos == KoreanPos.Noun);
                        if (!counted)
                        {
                            continue;
                        }

                        int seen;
                        bucket.TryGetValue(word.Text, out seen);
                        bucket[word.Text] = seen + 1;
                    }
                }
                catch (Exception)
                {
                    // Tokenizer/stemmer failures are ignored; words counted before the failure are kept.
                }
            }
            else
            {
                foreach (var piece in Talks[i].Content.Split(' '))
                {
                    // Pieces that parse as a number are not counted.
                    double parsed;
                    if (double.TryParse(piece, out parsed))
                    {
                        continue;
                    }

                    var str = piece.Trim();
                    if (str == "")
                    {
                        continue;
                    }

                    // Strip a single trailing '?', '!', '.' or ','.
                    if ("?!.,".Contains(str.Last()))
                    {
                        str = str.Remove(str.Length - 1);
                    }
                    if (str == "")
                    {
                        continue;
                    }

                    int seen;
                    bucket.TryGetValue(str, out seen);
                    bucket[str] = seen + 1;
                }
            }
        }

        var processed = System.Threading.Interlocked.Increment(ref cnt);
        if (step == 0 || (i % step == 0))
        {
            progress(processed);
        }
    }

    progress(cnt);

    // Merge the local counts into the shared table.
    lock (DateWords)
    {
        foreach (var monthEntry in localByDate)
        {
            if (!DateWords.ContainsKey(monthEntry.Key))
            {
                DateWords.Add(monthEntry.Key, new Dictionary<string, int>());
            }

            var shared = DateWords[monthEntry.Key];
            foreach (var wordEntry in monthEntry.Value)
            {
                if (shared.ContainsKey(wordEntry.Key))
                {
                    shared[wordEntry.Key] += wordEntry.Value;
                }
                else
                {
                    shared.Add(wordEntry.Key, wordEntry.Value);
                }
            }
        }
    }
}
Dictionary<String, int> totalNoun; // Occurrence count of every noun across the whole chat (Dictionary<String,int>)

/// <summary>
/// Splits the chat into fixed-length time sections and gathers token statistics,
/// both per section and in total (<c>totalWord</c>, <c>totalPosInfo</c>, <c>totalNoun</c>).
/// </summary>
/// <param name="_interval">Section length in seconds; must be positive.</param>
/// <param name="chatArray">The chat lines (<c>Line</c> objects) to analyze.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="_interval"/> is zero or negative.</exception>
/// <exception cref="ArgumentNullException"><paramref name="chatArray"/> is null.</exception>
/// <remarks>
/// An empty <paramref name="chatArray"/> yields an empty section list and empty totals.
/// The number of sections is derived from the latest timestamp found anywhere in the chat,
/// so lines that are not in chronological order cannot index past the end of the section list.
/// </remarks>
public void GenerateSections(int _interval, ArrayList chatArray)
{
    if (_interval <= 0)
    {
        throw new ArgumentOutOfRangeException("_interval", "The section interval must be a positive number of seconds.");
    }
    if (chatArray == null)
    {
        throw new ArgumentNullException("chatArray");
    }

    interval = _interval;
    sectionArray = new ArrayList();
    totalWord = new Dictionary<string, int>();
    totalPosInfo = new Dictionary<string, int>();
    totalNoun = new Dictionary<string, int>();

    if (chatArray.Count == 0)
    {
        return;
    }

    // Latest timestamp (in seconds) over all lines determines how many sections are needed.
    int lastSec = 0;
    foreach (Line line in chatArray)
    {
        int sec = line.hour * 3600 + line.min * 60 + line.sec;
        if (sec > lastSec)
        {
            lastSec = sec;
        }
    }

    for (int i = 0; i <= lastSec / interval; i++)
    {
        sectionArray.Add(new Section(i));
    }

    foreach (Line line in chatArray)
    {
        int totalSec = line.hour * 3600 + line.min * 60 + line.sec;
        Section section = (Section)sectionArray[totalSec / interval];

        section.count++;
        section.addId(line.name);

        var tokens = TwitterKoreanProcessorCS.Tokenize(line.chat);
        try
        {
            tokens = TwitterKoreanProcessorCS.Stem(tokens); // convert to base (dictionary) forms
        }
        catch (Exception)
        {
            // Deliberate best effort: when stemming fails, the unstemmed tokens are used.
            // The cause is unknown; e.g. the sentence '어 왜 벌써 61명남음?' cannot be stemmed.
        }

        foreach (var token in tokens)
        {
            string posName = token.Pos.ToString();
            int seen;

            section.addToken(token.Text, token.Pos);

            totalWord.TryGetValue(token.Text, out seen);
            totalWord[token.Text] = seen + 1;

            totalPosInfo.TryGetValue(posName, out seen);
            totalPosInfo[posName] = seen + 1;

            if (token.Pos == KoreanPos.Noun)
            {
                section.noun_count++;

                totalNoun.TryGetValue(token.Text, out seen);
                totalNoun[token.Text] = seen + 1;
            }

            // A "ㅋㅋ" token is counted as laughter only, whatever its part of speech is.
            if (token.Text.Equals("ㅋㅋ"))
            {
                section.kk_count++;
            }
            else if (token.Pos == KoreanPos.Space)
            {
                section.Space_count++;
            }
            else if (token.Pos == KoreanPos.KoreanParticle)
            {
                section.KoreanParticle_count++;
            }
            else if (token.Pos == KoreanPos.ProperNoun)
            {
                section.ProperNoun_count++;
            }
            else if (token.Pos == KoreanPos.Verb)
            {
                section.Verb_count++;
            }
            else if (token.Pos == KoreanPos.Josa)
            {
                section.Josa_count++;
            }
            else if (token.Pos == KoreanPos.Punctuation)
            {
                section.Punctuation_count++;
            }
            else if (token.Pos == KoreanPos.Alpha)
            {
                section.Alpha_count++;
            }
            else if (token.Pos == KoreanPos.Number)
            {
                section.Number_count++;
            }
        }
    }
}
// Normalize
/// <summary>
/// Normalizes the <c>text</c> member.
/// </summary>
/// <returns>The value returned by the normalizer.</returns>
public string NormalizeSample()
{
    return TwitterKoreanProcessorCS.Normalize(text);
}
} // Delete
//-----------------------< /region button >------------------------//
#endregion /Button

#region Parsing_Data
//-----------------------< parsing data >-----------------------------//
/// <summary>
/// Converts a natural-language search request into a search query string.
/// The input is normalized, tokenized and stemmed; noun-like, alphabetic, punctuation,
/// adverb and number tokens are kept and then rewritten in stages:
/// <list type="number">
/// <item>"X 제외" becomes " -X"; "A 혹은 B" becomes "AORB"; "A ~ B" becomes "A..B".</item>
/// <item>"ppt|pdf|xls|doc 형식" becomes "filetype:ppt|pdf|xls|doc".</item>
/// <item>Known site names (인스타그램, 네이버, 페이스북, 구글, 유튜브) become "site:..." terms.</item>
/// <item>"X 해시태그" becomes "%23X".</item>
/// <item>Terms are ordered: plain terms, then terms containing "-", then site/filetype terms;
/// every term except the first and those containing "-" or "#" is prefixed with "+".</item>
/// </list>
/// </summary>
/// <param name="input_Search_Data">The raw search request.</param>
/// <returns>The query terms concatenated without a separator.</returns>
/// <remarks>
/// Every rewrite acts on the most recently emitted term, so a rewrite early in the input can
/// no longer shift the indexes used by later ones. An operator word that lacks its operand(s)
/// is kept as an ordinary term. A "형식" word that does not follow a known file type is
/// dropped, as it was before.
/// </remarks>
public string StemSample2(string input_Search_Data)
{
    var tokens = TwitterKoreanProcessorCS.Tokenize(
        TwitterKoreanProcessorCS.Normalize(input_Search_Data));
    var stemmedTokens = TwitterKoreanProcessorCS.Stem(tokens);

    // Stage 0: keep only the token kinds that can act as search terms or operator words.
    List<string> list = new List<string>();
    foreach (var stemmedToken in stemmedTokens)
    {
        string pos = stemmedToken.Pos.ToString();
        if (pos.Contains("Noun") || pos == "Alpha" || pos == "Punctuation" || pos == "Adverb" || pos == "Number")
        {
            list.Add(stemmedToken.Text);
        }
    }

    // Stage 1: exclusion, alternative and range operators.
    List<string> second = new List<string>();
    for (int i = 0; i < list.Count; i++)
    {
        string term = list[i];
        bool hasLeft = second.Count > 0;
        bool hasRight = i + 1 < list.Count;

        if (term.Contains("제외") && hasLeft)
        {
            second[second.Count - 1] = " -" + second[second.Count - 1];
        }
        else if (term.Contains("혹은") && hasLeft && hasRight)
        {
            second[second.Count - 1] = second[second.Count - 1] + "OR" + list[i + 1];
            i++; // the right operand has been consumed
        }
        else if (term.Contains("~") && hasLeft && hasRight)
        {
            second[second.Count - 1] = second[second.Count - 1] + ".." + list[i + 1];
            i++; // the right operand has been consumed
        }
        else
        {
            second.Add(term);
        }
    }

    // Stage 2: "<type> 형식" -> "filetype:<type>". Types are checked in this fixed order.
    string[] fileTypes = { "ppt", "pdf", "xls", "doc" };
    List<string> third = new List<string>();
    for (int i = 0; i < second.Count; i++)
    {
        if (!second[i].Contains("형식"))
        {
            third.Add(second[i]);
            continue;
        }

        if (i == 0 || third.Count == 0)
        {
            continue; // nothing precedes the word: no file type to take, so it is dropped
        }

        foreach (string fileType in fileTypes)
        {
            if (second[i - 1].Contains(fileType))
            {
                third[third.Count - 1] = "filetype:" + fileType;
                break;
            }
        }
    }

    // Stage 3: site names -> "site:" restrictions.
    List<string> forth = new List<string>();
    foreach (string term in third)
    {
        switch (term)
        {
            case "인스타그램":
                forth.Add("site:www.instagram.com");
                break;
            case "네이버":
                forth.Add("site:www.naver.com");
                break;
            case "페이스북":
                forth.Add("site:www.facebook.com");
                break;
            case "구글":
                forth.Add("site:www.google.com");
                break;
            case "유튜브":
                forth.Add("site:www.youtube.com");
                break;
            default:
                forth.Add(term);
                break;
        }
    }

    // Stage 4: "X 해시태그" -> "%23X" (URL-encoded '#').
    List<string> fifth = new List<string>();
    foreach (string term in forth)
    {
        if (term == "해시태그" && fifth.Count > 0)
        {
            fifth[fifth.Count - 1] = "%23" + fifth[fifth.Count - 1];
        }
        else
        {
            fifth.Add(term);
        }
    }

    // Stage 5: order the terms: plain terms, then terms containing "-", then site/filetype terms.
    // NOTE(review): the original had a second, unreachable Contains("-") branch feeding an "or" list;
    // it never matched and was removed. Confirm whether "OR" terms were meant to be grouped separately.
    List<string> plus = new List<string>();
    List<string> sub = new List<string>();
    List<string> seq = new List<string>();
    foreach (string term in fifth)
    {
        if (term.Contains("-"))
        {
            sub.Add(term);
        }
        else if (term.Contains("site") || term.Contains("filetype"))
        {
            seq.Add(term);
        }
        else
        {
            plus.Add(term);
        }
    }

    List<string> output = new List<string>();
    output.AddRange(plus);
    output.AddRange(sub);
    output.AddRange(seq);

    // Stage 6: prefix with "+", except the first term and terms containing "-" or "#".
    // The check now inspects the term actually being emitted (output[i]); it used to look at
    // fifth[i], which is a different term once stage 5 has reordered the list.
    // NOTE(review): hashtags are emitted as "%23..." and therefore do not match "#" — confirm intent.
    List<string> input = new List<string>();
    for (int i = 0; i < output.Count; i++)
    {
        bool keepAsIs = i == 0 || output[i].Contains("-") || output[i].Contains("#");
        input.Add(keepAsIs ? output[i] : "+" + output[i]);
    }

    return string.Join("", input);
} // Google parsing
/// <summary>
/// Background-worker body: tokenizes every "*.txt" file of the input folder and writes the
/// space-separated tokens to a file of the same name in the output folder.
/// </summary>
/// <param name="sender">The background worker raising the event.</param>
/// <param name="e">
/// <c>e.Argument</c> is a <c>string[]</c>: element 0 is the input folder, element 1 the output folder.
/// </param>
/// <remarks>
/// File contents are lower-cased before tokenizing, tokens tagged as Space are left out,
/// and the same encoding is used for reading and writing. Any failure while reading, tokenizing
/// or writing stops the run and shows a message box.
/// </remarks>
private void BgWorker_DoWork(object sender, DoWorkEventArgs e)
{
    // Select the text encoding based on the user's selection (read on the UI thread).
    Encoding SelectedEncoding = null;
    this.Invoke((MethodInvoker)delegate ()
    {
        SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
    });

    // Get the list of files.
    string[] folders = (string[])e.Argument;
    var SearchDepth = ScanSubfolderCheckbox.Checked
        ? SearchOption.AllDirectories
        : SearchOption.TopDirectoryOnly;
    var files = Directory.EnumerateFiles(folders[0], "*.txt", SearchDepth);

    try
    {
        string outputdir = Path.Combine(folders[1]);
        Directory.CreateDirectory(outputdir);

        foreach (string fileName in files)
        {
            string Filename_Clean = Path.GetFileName(fileName);

            // Report what we're working on.
            FilenameLabel.Invoke((MethodInvoker)delegate
            {
                FilenameLabel.Text = "Analyzing: " + Filename_Clean;
            });

            string readText = File.ReadAllText(fileName, SelectedEncoding).ToLower();
            var TokenResults = TwitterKoreanProcessorCS.Tokenize(readText);

            // Walk the token sequence exactly once; Count() followed by ElementAt(i) would
            // re-scan it for every token.
            StringBuilder Builder = new StringBuilder();
            foreach (var token in TokenResults)
            {
                if (token.Pos != KoreanPos.Space)
                {
                    Builder.Append(token.Text).Append(' ');
                }
            }

            using (StreamWriter fileout = new StreamWriter(Path.Combine(outputdir, Filename_Clean), false, SelectedEncoding))
            {
                fileout.Write(Builder.ToString());
            }
        }
    }
    catch
    {
        MessageBox.Show("KoToken encountered a problem while trying to tokenize/write a file.");
    }
}