/// <summary>
/// Normalizes a buffer of Persian text in place.
/// <para/>
/// FARSI_YEH and YEH_BARREE become YEH, KEHEH becomes KAF, HEH_YEH and
/// HEH_GOAL become HEH, and every HAMZA_ABOVE is removed from the buffer.
/// </summary>
/// <param name="s"> input buffer; modified in place </param>
/// <param name="len"> number of valid characters in <paramref name="s"/> </param>
/// <returns> number of valid characters remaining after normalization </returns>
public virtual int Normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        char current = s[pos];

        if (current == HAMZA_ABOVE)
        {
            // necessary for HEH + HAMZA: drop the mark and look at the
            // character that slid into this slot before moving on.
            len = StemmerUtil.Delete(s, pos, len);
            continue;
        }

        if (current == FARSI_YEH || current == YEH_BARREE)
        {
            s[pos] = YEH;
        }
        else if (current == KEHEH)
        {
            s[pos] = KAF;
        }
        else if (current == HEH_YEH || current == HEH_GOAL)
        {
            s[pos] = HEH;
        }

        pos++;
    }

    return len;
}
/// <summary>
/// Advances to the next token and folds width variants in its term buffer:
/// characters in U+FF01..U+FF5E are shifted down by 0xFEE0, and characters
/// in U+FF65..U+FF9F are either merged into the preceding character (when
/// <c>Combine</c> accepts a U+FF9E / U+FF9F mark) or replaced through the
/// <c>KANA_NORM</c> table.
/// </summary>
/// <returns> <c>true</c> if a token was produced; <c>false</c> when the input is exhausted </returns>
public override bool IncrementToken()
{
    if (!m_input.IncrementToken())
    {
        return false;
    }

    char[] chars = termAtt.Buffer;
    int count = termAtt.Length;

    int idx = 0;
    while (idx < count)
    {
        char c = chars[idx];
        bool isFullwidthAscii = c >= 0xFF01 && c <= 0xFF5E;
        bool isHalfwidthKana = c >= 0xFF65 && c <= 0xFF9F;

        if (isFullwidthAscii)
        {
            // Fullwidth ASCII variants
            chars[idx] = (char)(c - 0xFEE0);
        }
        else if (isHalfwidthKana)
        {
            // Halfwidth Katakana variants
            bool isMark = c == 0xFF9E || c == 0xFF9F;
            if (isMark && idx > 0 && Combine(chars, idx, c))
            {
                // The mark was folded into the previous character; remove it
                // and re-examine whatever now occupies this slot.
                count = StemmerUtil.Delete(chars, idx, count);
                continue;
            }

            chars[idx] = KANA_NORM[c - 0xFF65];
        }

        idx++;
    }

    termAtt.Length = count;
    return true;
}
/// <summary>
/// Compose into standard form any compositions in the decompositions table.
/// <para/>
/// Looks at the one or two characters following <paramref name="pos"/>; when a
/// table row matches <paramref name="ch0"/> plus those followers (and the row's
/// flag mask intersects <c>sd.flag</c>), the character at <paramref name="pos"/>
/// is overwritten with the composed form and the consumed followers are deleted.
/// </summary>
/// <param name="ch0"> offset of the character at <paramref name="pos"/>, as matched against column 0 of the table </param>
/// <param name="block0"> block of the character at <paramref name="pos"/>; the next character must share it </param>
/// <param name="sd"> script data supplying the base code point and flag </param>
/// <param name="text"> buffer being normalized; modified in place </param>
/// <param name="pos"> index of the first character of the candidate sequence </param>
/// <param name="len"> number of valid characters in <paramref name="text"/> </param>
/// <returns> the (possibly reduced) number of valid characters </returns>
private int Compose(int ch0, Regex block0, ScriptData sd, char[] text, int pos, int len)
{
    int secondPos = pos + 1;
    if (secondPos >= len) // need at least 2 chars!
    {
        return len;
    }

    char second = text[secondPos];
    int ch1 = second - sd.@base;
    var block1 = GetBlockForChar(second);
    if (block1 != block0) // needs to be the same writing system
    {
        return len;
    }

    // -1 means "no usable third character".
    int ch2 = -1;
    int thirdPos = pos + 2;
    if (thirdPos < len)
    {
        char third = text[thirdPos];
        var block2 = GetBlockForChar(third);
        if (third == '\u200D') // ZWJ
        {
            ch2 = 0xFF;
        }
        else if (block2 == block1)
        {
            ch2 = third - sd.@base;
        }
        // otherwise leave ch2 at -1: still allow a 2-char match
    }

    int flagMask = (int)sd.flag;
    for (int row = 0; row < decompositions.Length; row++)
    {
        var rule = decompositions[row];

        if (rule[0] != ch0 || (rule[4] & flagMask) == 0)
        {
            continue;
        }

        if (rule[1] != ch1)
        {
            continue;
        }

        bool usesThird = rule[2] >= 0;
        if (usesThird && rule[2] != ch2)
        {
            continue;
        }

        text[pos] = (char)(sd.@base + rule[3]);
        len = StemmerUtil.Delete(text, secondPos, len);
        if (usesThird)
        {
            // The former third character has shifted into secondPos.
            len = StemmerUtil.Delete(text, secondPos, len);
        }

        return len;
    }

    return len;
}
private const char oe_se = '\u00F6'; //ö

/// <summary>
/// Advances to the next token and folds its term buffer in place:
/// the aa / ae_se / ae constants become 'a' (upper-case variants become 'A'),
/// the oe / oe_se constants become 'o' (upper-case variants become 'O'), and
/// for a plain a/A followed by a, e or o (either case), or a plain o/O followed
/// by e or o (either case), the following character is deleted.
/// </summary>
/// <returns> <c>true</c> if a token was produced; <c>false</c> when the input is exhausted </returns>
public override bool IncrementToken()
{
    if (!m_input.IncrementToken())
    {
        return false;
    }

    char[] buffer = charTermAttribute.Buffer;
    int length = charTermAttribute.Length;

    for (int pos = 0; pos < length; pos++)
    {
        char c = buffer[pos];

        if (c == aa || c == ae_se || c == ae)
        {
            buffer[pos] = 'a';
            continue;
        }

        if (c == AA || c == AE_se || c == AE)
        {
            buffer[pos] = 'A';
            continue;
        }

        if (c == oe || c == oe_se)
        {
            buffer[pos] = 'o';
            continue;
        }

        if (c == OE || c == OE_se)
        {
            buffer[pos] = 'O';
            continue;
        }

        // The digraph rules need a following character.
        if (pos >= length - 1)
        {
            continue;
        }

        char next = buffer[pos + 1];
        bool nextIsEOrO = next == 'e' || next == 'E' || next == 'o' || next == 'O';
        bool nextIsA = next == 'a' || next == 'A';

        bool startsWithA = c == 'a' || c == 'A';
        bool startsWithO = c == 'o' || c == 'O';

        if ((startsWithA && (nextIsA || nextIsEOrO)) || (startsWithO && nextIsEOrO))
        {
            length = StemmerUtil.Delete(buffer, pos + 1, length);
        }
    }

    charTermAttribute.Length = length;
    return true;
}
/// <summary>
/// Normalizes a buffer of Arabic text in place.
/// <para/>
/// The three hamza/madda ALEF variants become ALEF, DOTLESS_YEH becomes YEH,
/// TEH_MARBUTA becomes HEH, and TATWEEL plus the listed harakat
/// (KASRATAN, DAMMATAN, FATHATAN, FATHA, DAMMA, KASRA, SHADDA, SUKUN) are removed.
/// </summary>
/// <param name="s"> input buffer; modified in place </param>
/// <param name="len"> number of valid characters in <paramref name="s"/> </param>
/// <returns> number of valid characters remaining after normalization </returns>
public virtual int Normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        switch (s[pos])
        {
            case TATWEEL:
            case KASRATAN:
            case DAMMATAN:
            case FATHATAN:
            case FATHA:
            case DAMMA:
            case KASRA:
            case SHADDA:
            case SUKUN:
                // Remove the character and re-examine the one that takes its place.
                len = StemmerUtil.Delete(s, pos, len);
                continue;

            case ALEF_MADDA:
            case ALEF_HAMZA_ABOVE:
            case ALEF_HAMZA_BELOW:
                s[pos] = ALEF;
                break;

            case DOTLESS_YEH:
                s[pos] = YEH;
                break;

            case TEH_MARBUTA:
                s[pos] = HEH;
                break;
        }

        pos++;
    }

    return len;
}
/// <summary>
/// Second normalization pass over a term buffer.
/// <para/>
/// For terms longer than 8 characters a trailing 'e', 'o' or 'u' is dropped.
/// For terms longer than 4 characters a trailing 'i' is dropped, and then
/// (if still longer than 4) consecutive repeats of 'k', 'p' or 't' are
/// collapsed to a single occurrence.
/// </summary>
/// <param name="s"> term buffer; modified in place </param>
/// <param name="len"> number of valid characters in <paramref name="s"/> </param>
/// <returns> the new number of valid characters </returns>
private int Norm2(char[] s, int len)
{
    if (len > 8)
    {
        switch (s[len - 1])
        {
            case 'e':
            case 'o':
            case 'u':
                len--;
                break;
        }
    }

    if (len <= 4)
    {
        return len;
    }

    if (s[len - 1] == 'i')
    {
        len--;
    }

    if (len <= 4)
    {
        return len;
    }

    char previous = s[0];
    int pos = 1;
    while (pos < len)
    {
        char current = s[pos];
        bool collapsible = previous == 'k' || previous == 'p' || previous == 't';

        if (current == previous && collapsible)
        {
            // Drop the repeat and compare the next character against the same predecessor.
            len = StemmerUtil.Delete(s, pos, len);
        }
        else
        {
            previous = current;
            pos++;
        }
    }

    return len;
}
/// <summary>
/// Normalizes a term buffer in place.
/// <para/>
/// For terms longer than 4 characters, the listed accented vowels and 'ç' are
/// replaced by their unaccented letters and runs of the same letter are
/// collapsed. Afterwards, while the term is still longer than 4 characters,
/// an "ie" ending (as reported by <c>StemmerUtil.EndsWith</c>) is stripped,
/// followed by one trailing 'r', up to two trailing 'e's, and one character of
/// a doubled final letter.
/// </summary>
/// <param name="s"> term buffer; modified in place </param>
/// <param name="len"> number of valid characters in <paramref name="s"/> </param>
/// <returns> the new number of valid characters </returns>
private int Norm(char[] s, int len)
{
    if (len > 4)
    {
        // Pass 1: strip accents.
        for (int k = 0; k < len; k++)
        {
            char c = s[k];
            if (c == 'à' || c == 'á' || c == 'â')
            {
                s[k] = 'a';
            }
            else if (c == 'ô')
            {
                s[k] = 'o';
            }
            else if (c == 'è' || c == 'é' || c == 'ê')
            {
                s[k] = 'e';
            }
            else if (c == 'ù' || c == 'û')
            {
                s[k] = 'u';
            }
            else if (c == 'î')
            {
                s[k] = 'i';
            }
            else if (c == 'ç')
            {
                s[k] = 'c';
            }
        }

        // Pass 2: collapse repeated letters.
        char previous = s[0];
        int pos = 1;
        while (pos < len)
        {
            if (s[pos] == previous && char.IsLetter(previous))
            {
                // Remove the repeat; the same index is checked again.
                len = StemmerUtil.Delete(s, pos, len);
            }
            else
            {
                previous = s[pos];
                pos++;
            }
        }
    }

    if (len > 4 && StemmerUtil.EndsWith(s, len, "ie"))
    {
        len -= 2;
    }

    if (len <= 4)
    {
        return len;
    }

    if (s[len - 1] == 'r')
    {
        len--;
    }

    // A trailing 'e' may be removed twice in a row.
    for (int pass = 0; pass < 2; pass++)
    {
        if (s[len - 1] == 'e')
        {
            len--;
        }
    }

    char last = s[len - 1];
    if (last == s[len - 2] && char.IsLetter(last))
    {
        len--;
    }

    return len;
}
/// <summary>
/// Advances to the next token and normalizes its term buffer in place using a
/// small state machine over the constants <c>N</c>, <c>U</c> and <c>V</c>:
/// <list type="bullet">
///   <item><description>'ä', 'ö', 'ü' are replaced by 'a', 'o', 'u'.</description></item>
///   <item><description>'ß' is expanded to "ss" (the buffer is grown by one character).</description></item>
///   <item><description>an 'e' seen while in state <c>U</c> (entered after 'a', 'o', or a 'u'
///   that follows state <c>N</c>) is deleted.</description></item>
/// </list>
/// </summary>
/// <returns> <c>true</c> if a token was produced; <c>false</c> when the input is exhausted </returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    int state = N;
    char[] buffer = termAtt.Buffer();
    int length = termAtt.Length;

    int pos = 0;
    while (pos < length)
    {
        char c = buffer[pos];

        if (c == 'a' || c == 'o')
        {
            state = U;
        }
        else if (c == 'u')
        {
            if (state == N)
            {
                state = U;
            }
            else
            {
                state = V;
            }
        }
        else if (c == 'e')
        {
            bool dropE = state == U;
            state = V;
            if (dropE)
            {
                // Remove the 'e' and re-examine the character that replaces it.
                length = StemmerUtil.Delete(buffer, pos, length);
                continue;
            }
        }
        else if (c == 'i' || c == 'q' || c == 'y')
        {
            state = V;
        }
        else if (c == 'ä')
        {
            buffer[pos] = 'a';
            state = V;
        }
        else if (c == 'ö')
        {
            buffer[pos] = 'o';
            state = V;
        }
        else if (c == 'ü')
        {
            buffer[pos] = 'u';
            state = V;
        }
        else if (c == 'ß')
        {
            // Overwrite with the first 's', then open a gap for the second one.
            buffer[pos] = 's';
            int insertAt = pos + 1;
            buffer = termAtt.ResizeBuffer(1 + length);
            if (insertAt < length)
            {
                Array.Copy(buffer, insertAt, buffer, insertAt + 1, length - insertAt);
            }
            buffer[insertAt] = 's';
            length++;
            state = N;

            // Skip past the inserted character as well.
            pos = insertAt;
        }
        else
        {
            state = N;
        }

        pos++;
    }

    termAtt.Length = length;
    return true;
}
/// <summary>
/// Normalizes a buffer of Hindi text in place.
/// <para/>
/// Handles, in a single left-to-right pass: dead n (U+0928 + U+094D) to bindu,
/// candrabindu to bindu, nukta removal and nukta-composed letters to their
/// base letters, ZWJ/ZWNJ and virama removal, chandra/short vowel
/// replacement, and long-to-short independent and dependent vowels.
/// </summary>
/// <param name="s"> input buffer; modified in place </param>
/// <param name="len"> number of valid characters in <paramref name="s"/> </param>
/// <returns> number of valid characters remaining after normalization </returns>
public virtual int normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        char c = s[pos];

        // dead n -> bindu
        if (c == '\u0928')
        {
            int next = pos + 1;
            if (next < len && s[next] == '\u094D')
            {
                s[pos] = '\u0902';
                len = StemmerUtil.Delete(s, next, len);
            }
            pos++;
            continue;
        }

        // nukta, zwj/zwnj and virama are removed; the same index is then re-examined
        if (c == '\u093C' || c == '\u200D' || c == '\u200C' || c == '\u094D')
        {
            len = StemmerUtil.Delete(s, pos, len);
            continue;
        }

        char mapped;
        switch (c)
        {
            // candrabindu -> bindu
            case '\u0901': mapped = '\u0902'; break;

            // nukta-composed letters -> base letters
            case '\u0929': mapped = '\u0928'; break;
            case '\u0931': mapped = '\u0930'; break;
            case '\u0934': mapped = '\u0933'; break;
            case '\u0958': mapped = '\u0915'; break;
            case '\u0959': mapped = '\u0916'; break;
            case '\u095A': mapped = '\u0917'; break;
            case '\u095B': mapped = '\u091C'; break;
            case '\u095C': mapped = '\u0921'; break;
            case '\u095D': mapped = '\u0922'; break;
            case '\u095E': mapped = '\u092B'; break;
            case '\u095F': mapped = '\u092F'; break;

            // chandra/short -> replace
            case '\u0945':
            case '\u0946':
                mapped = '\u0947';
                break;
            case '\u0949':
            case '\u094A':
                mapped = '\u094B';
                break;
            case '\u090D':
            case '\u090E':
                mapped = '\u090F';
                break;
            case '\u0911':
            case '\u0912':
                mapped = '\u0913';
                break;
            case '\u0972': mapped = '\u0905'; break;

            // long -> short ind. vowels
            case '\u0906': mapped = '\u0905'; break;
            case '\u0908': mapped = '\u0907'; break;
            case '\u090A': mapped = '\u0909'; break;
            case '\u0960': mapped = '\u090B'; break;
            case '\u0961': mapped = '\u090C'; break;
            case '\u0910': mapped = '\u090F'; break;
            case '\u0914': mapped = '\u0913'; break;

            // long -> short dep. vowels
            case '\u0940': mapped = '\u093F'; break;
            case '\u0942': mapped = '\u0941'; break;
            case '\u0944': mapped = '\u0943'; break;
            case '\u0963': mapped = '\u0962'; break;
            case '\u0948': mapped = '\u0947'; break;
            case '\u094C': mapped = '\u094B'; break;

            default: mapped = c; break;
        }

        if (mapped != c)
        {
            s[pos] = mapped;
        }

        pos++;
    }

    return len;
}
/// <summary>
/// Normalizes a buffer of Sorani text in place.
/// <para/>
/// YEH / DOTLESS_YEH become FARSI_YEH; KAF becomes KEHEH; a HEH that is last
/// in the buffer or directly followed by ZWNJ becomes AE; TEH_MARBUTA becomes
/// AE; HEH_DOACHASHMEE becomes HEH; a REH at index 0 and every RREH_ABOVE
/// become RREH. ZWNJ, TATWEEL, the listed harakat, and any other character in
/// Unicode category <see cref="UnicodeCategory.Format"/> are removed.
/// </summary>
/// <param name="s"> input buffer; modified in place </param>
/// <param name="len"> number of valid characters in <paramref name="s"/> </param>
/// <returns> number of valid characters remaining after normalization </returns>
public virtual int Normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        bool remove = false;

        switch (s[pos])
        {
            case YEH:
            case DOTLESS_YEH:
                s[pos] = FARSI_YEH;
                break;

            case KAF:
                s[pos] = KEHEH;
                break;

            case ZWNJ:
                // A HEH immediately before the joiner is rewritten before the joiner goes away.
                if (pos > 0 && s[pos - 1] == HEH)
                {
                    s[pos - 1] = AE;
                }
                remove = true;
                break;

            case HEH:
                if (pos == len - 1)
                {
                    s[pos] = AE;
                }
                break;

            case TEH_MARBUTA:
                s[pos] = AE;
                break;

            case HEH_DOACHASHMEE:
                s[pos] = HEH;
                break;

            case REH:
                if (pos == 0)
                {
                    s[pos] = RREH;
                }
                break;

            case RREH_ABOVE:
                s[pos] = RREH;
                break;

            case TATWEEL:
            case KASRATAN:
            case DAMMATAN:
            case FATHATAN:
            case FATHA:
            case DAMMA:
            case KASRA:
            case SHADDA:
            case SUKUN:
                remove = true;
                break;

            default:
                remove = CharUnicodeInfo.GetUnicodeCategory(s[pos]) == UnicodeCategory.Format;
                break;
        }

        if (remove)
        {
            // Stay on this index: the next character has shifted into it.
            len = StemmerUtil.Delete(s, pos, len);
        }
        else
        {
            pos++;
        }
    }

    return len;
}