/// <summary>
/// Cuts <paramref name="inputstring"/> into a chain of words and then merges
/// adjacent words whose character categories <c>IsMergedable</c> accepts as a pair.
/// </summary>
/// <param name="inputstring">
/// Text to scan. It is stored in <c>inputString</c> and read from index 0.
/// </param>
/// <returns>The word chain after the merge pass.</returns>
public WordChain Scan(string inputstring)
{
    this.inputString = inputstring;
    this.stringIndex = 0;

    WordChain chain = new WordChain();

    // Pass 1: keep cutting words off the input until an empty word comes back,
    // which signals that nothing more could be extracted.
    for (WordTable picked = PicupWordTable(); picked.word != ""; picked = PicupWordTable())
    {
        chain.Add(picked);
    }

    // Pass 2: merge certain adjacent pairs (Kanji + Hiragana and so on) into one entry.
    WordTable previous = new WordTable();
    CharCategory previousCategory = CharCategory.Null;

    for (WordTable current = chain.Head; current != null; current = current.next)
    {
        CharCategory merged = IsMergedable(previousCategory, current.charCategory);

        if (merged != CharCategory.Null)
        {
            // Build the combined entry, then swap it in for the two originals.
            WordTable combined = new WordTable(previous.word + current.word)
            {
                charCategory = merged,
                posCategory = PosCategory.Other
            };

            chain.Delete(previous);
            chain.Insert(current, combined);
            chain.Delete(current);

            // Continue from the combined entry so the next step follows its link.
            current = combined;
        }

        previousCategory = current.charCategory;
        previous = current;
    }

    return chain;
}
/// <summary>
/// Returns the category of a character.
/// </summary>
/// <param name="c">
/// Character whose category is returned.
/// </param>
/// <returns>
/// The first category that matches, tested in this order: control, digit, letter,
/// number, punctuation, separator, surrogate, symbol, whitespace.
/// <c>CharCategory.Unknown</c> when none of the tests match.
/// </returns>
public static CharCategory Category(this char c)
{
    // The order of these tests decides the result when a character satisfies
    // more than one predicate, so it must not be rearranged.
    if (Char.IsControl(c))
    {
        return CharCategory.Control;
    }

    if (Char.IsDigit(c))
    {
        return CharCategory.Digit;
    }

    if (Char.IsLetter(c))
    {
        return CharCategory.Letter;
    }

    if (Char.IsNumber(c))
    {
        return CharCategory.Number;
    }

    if (Char.IsPunctuation(c))
    {
        return CharCategory.Punctuation;
    }

    if (Char.IsSeparator(c))
    {
        return CharCategory.Separator;
    }

    if (Char.IsSurrogate(c))
    {
        return CharCategory.Surrogate;
    }

    if (Char.IsSymbol(c))
    {
        return CharCategory.Symbol;
    }

    // NOTE(review): Char.IsWhiteSpace is documented as covering the separator
    // categories plus a few control characters, all of which the earlier tests
    // already claim, so this branch looks unreachable - confirm before relying
    // on CharCategory.Whitespace ever being returned.
    if (Char.IsWhiteSpace(c))
    {
        return CharCategory.Whitespace;
    }

    return CharCategory.Unknown;
}
private int cost; // cost of transWord

//-----------------------------------------------------------
/// <summary>
/// Creates an unlinked entry: empty word and translation, <c>Null</c>
/// character/part-of-speech categories, no result, not divided, cost 100.
/// </summary>
public WordTable()
{
    // Text fields start out empty.
    word = "";
    transWord = "";

    // Not linked to any neighbour yet.
    prev = null;
    next = null;

    // Nothing is classified or looked up yet.
    charCategory = CharCategory.Null;
    posCategory = PosCategory.Null;
    divided = Divided.Non;
    sResult = null;

    // 100 is the starting cost; the original comment labels it "unknown word".
    cost = 100;
}
//---------------------------------------------------------------------
/// <summary>
/// Decides whether two adjacent words should be merged into a single table
/// entry, based only on their character categories.
/// </summary>
/// <param name="prev">Category of the preceding word.</param>
/// <param name="here">Category of the current word.</param>
/// <returns>
/// The category of the merged word, or <c>CharCategory.Null</c> when the pair
/// is not merged.
/// </returns>
private CharCategory IsMergedable(CharCategory prev, CharCategory here)
{
    if (prev == CharCategory.Kanji)
    {
        // Kanji + Hiragana (e.g. 走る)
        return here == CharCategory.Hiragana
            ? CharCategory.KanHira
            : CharCategory.Null;
    }

    if (prev == CharCategory.Letter)
    {
        // Letter + Digit (e.g. MP3)
        if (here == CharCategory.Digit)
        {
            return CharCategory.LetterDigit;
        }

        // Letter + Kanji (e.g. A社)
        if (here == CharCategory.Kanji)
        {
            return CharCategory.LetterMix;
        }

        // A Letter + Hangul rule (e.g. A사) exists only as disabled code:
        // if (here == CharCategory.Hangul)
        // {
        //     return CharCategory.LetterMix;
        // }
    }

    return CharCategory.Null;
}
//--------------------------------------------------------------
/// <summary>
/// Very simple string segmentation: takes characters from <c>inputString</c>
/// starting at <c>stringIndex</c> and stops where the character category
/// changes. The collected run (a presumed word) is returned as a WordTable.
/// </summary>
/// <returns>
/// A WordTable holding the extracted text and its category. The word is empty
/// when <c>stringIndex</c> is already at the end of the input.
/// </returns>
private WordTable PicupWordTable()
{
    StringBuilder buffer = new StringBuilder();
    bool insideTag = false;                          // currently between '<' and '>'?
    CharCategory wordCategory = CharCategory.Null;   // category of the word being built
    WordTable result = new WordTable();

    while (this.stringIndex < this.inputString.Length)
    {
        // Look at the character at the current position and classify it.
        char ch = this.inputString[this.stringIndex];
        CharCategory chCategory = GetCharCategory(ch);

        // In HTML mode everything from '<' through '>' (e.g. <font size=+1>)
        // is grouped under the single HTMLtag category.
        if (this.htmltext)
        {
            if (ch == '<')
            {
                insideTag = true;
            }

            if (insideTag)
            {
                chCategory = CharCategory.HTMLtag;
            }

            if (ch == '>')
            {
                insideTag = false;
            }
        }

        if (wordCategory == CharCategory.Null)
        {
            // First character: it fixes the category of this word.
            wordCategory = chCategory;
        }
        else if (wordCategory != chCategory)
        {
            // Category changed: the word ends before this character.
            break;
        }

        buffer.Append(ch);
        this.stringIndex++;   // advance the read position
    }

    result.word = buffer.ToString();
    result.charCategory = wordCategory;

    // Derive the part of speech (and, for some categories, a zero cost)
    // from the character category.
    if (wordCategory == CharCategory.Katakana)
    {
        result.posCategory = PosCategory.Noun;
    }
    else if (wordCategory == CharCategory.HiraganaWo)
    {
        result.posCategory = PosCategory.PP;
        result.Cost = 0;
    }
    else
    {
        result.posCategory = PosCategory.Other;

        if (wordCategory == CharCategory.Other || wordCategory == CharCategory.Separator)
        {
            result.Cost = 0;
        }
    }

    return result;
}