/// <summary>
/// Normalize an input buffer of Persian text.
/// <para>
/// FARSI_YEH and YEH_BARREE are rewritten to YEH, KEHEH to KAF, HEH_YEH and
/// HEH_GOAL to HEH; every HAMZA_ABOVE is passed to StemmerUtil.delete.
/// </para>
/// </summary>
/// <param name="s"> input buffer </param>
/// <param name="len"> length of input buffer </param>
/// <returns> length of input buffer after normalization </returns>
public virtual int normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        char current = s[pos];

        if (current == HAMZA_ABOVE)
        {
            // necessary for HEH + HAMZA
            // Do not advance: the same index is inspected again on the next pass.
            len = StemmerUtil.delete(s, pos, len);
            continue;
        }

        if (current == FARSI_YEH || current == YEH_BARREE)
        {
            s[pos] = YEH;
        }
        else if (current == KEHEH)
        {
            s[pos] = KAF;
        }
        else if (current == HEH_YEH || current == HEH_GOAL)
        {
            s[pos] = HEH;
        }

        pos++;
    }

    return len;
}
private const char oe_se = '\u00F6'; //ö

/// <summary>
/// Advances the wrapped input by one token and rewrites its term buffer in place.
/// <para>
/// Characters equal to aa, ae_se or ae become 'a'; AA, AE_se or AE become 'A';
/// oe or oe_se become 'o'; OE or OE_se become 'O'. Otherwise, a plain 'a'/'A'
/// followed by a/A/e/E/o/O, or a plain 'o'/'O' followed by e/E/o/O, has that
/// following character passed to StemmerUtil.delete.
/// </para>
/// </summary>
/// <returns> false when the input has no further token, true otherwise </returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    char[] buffer = charTermAttribute.buffer();
    int length = charTermAttribute.length();

    for (int pos = 0; pos < length; pos++)
    {
        char current = buffer[pos];

        if (current == aa || current == ae_se || current == ae)
        {
            buffer[pos] = 'a';
            continue;
        }
        if (current == AA || current == AE_se || current == AE)
        {
            buffer[pos] = 'A';
            continue;
        }
        if (current == oe || current == oe_se)
        {
            buffer[pos] = 'o';
            continue;
        }
        if (current == OE || current == OE_se)
        {
            buffer[pos] = 'O';
            continue;
        }

        // The remaining rules need a following character to look at.
        if (pos >= length - 1)
        {
            continue;
        }

        char next = buffer[pos + 1];
        bool nextIsA = next == 'a' || next == 'A';
        bool nextIsE = next == 'e' || next == 'E';
        bool nextIsO = next == 'o' || next == 'O';

        bool dropNext;
        if (current == 'a' || current == 'A')
        {
            dropNext = nextIsA || nextIsE || nextIsO;
        }
        else if (current == 'o' || current == 'O')
        {
            dropNext = nextIsE || nextIsO;
        }
        else
        {
            dropNext = false;
        }

        if (dropNext)
        {
            length = StemmerUtil.delete(buffer, pos + 1, length);
        }
    }

    charTermAttribute.Length = length;
    return true;
}
/// <summary>
/// Normalize an input buffer of Arabic text.
/// <para>
/// ALEF_MADDA, ALEF_HAMZA_ABOVE and ALEF_HAMZA_BELOW are rewritten to ALEF,
/// DOTLESS_YEH to YEH and TEH_MARBUTA to HEH. TATWEEL, KASRATAN, DAMMATAN,
/// FATHATAN, FATHA, DAMMA, KASRA, SHADDA and SUKUN are passed to
/// StemmerUtil.delete.
/// </para>
/// </summary>
/// <param name="s"> input buffer </param>
/// <param name="len"> length of input buffer </param>
/// <returns> length of input buffer after normalization </returns>
public virtual int normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        switch (s[pos])
        {
            case ALEF_MADDA:
            case ALEF_HAMZA_ABOVE:
            case ALEF_HAMZA_BELOW:
                s[pos] = ALEF;
                pos++;
                break;

            case DOTLESS_YEH:
                s[pos] = YEH;
                pos++;
                break;

            case TEH_MARBUTA:
                s[pos] = HEH;
                pos++;
                break;

            case TATWEEL:
            case KASRATAN:
            case DAMMATAN:
            case FATHATAN:
            case FATHA:
            case DAMMA:
            case KASRA:
            case SHADDA:
            case SUKUN:
                // Index is left unchanged so the same position is examined again.
                len = StemmerUtil.delete(s, pos, len);
                break;

            default:
                pos++;
                break;
        }
    }

    return len;
}
/// <summary>
/// Shortens the word held in <paramref name="s"/>: for words longer than 8
/// characters a trailing 'e', 'o' or 'u' is dropped; for words longer than 4
/// characters a trailing 'i' is dropped and, if the word is still longer than 4,
/// each repeated 'k', 'p' or 't' is passed to StemmerUtil.delete.
/// </summary>
/// <param name="s"> input buffer </param>
/// <param name="len"> length of input buffer </param>
/// <returns> length of input buffer after normalization </returns>
private int norm2(char[] s, int len)
{
    if (len > 8)
    {
        char last = s[len - 1];
        if (last == 'e' || last == 'o' || last == 'u')
        {
            len--;
        }
    }

    if (len <= 4)
    {
        return len;
    }

    if (s[len - 1] == 'i')
    {
        len--;
    }

    if (len <= 4)
    {
        return len;
    }

    char previous = s[0];
    int pos = 1;
    while (pos < len)
    {
        bool doubled = s[pos] == previous
            && (previous == 'k' || previous == 'p' || previous == 't');

        if (doubled)
        {
            // Stay on this index; it is compared against 'previous' again.
            len = StemmerUtil.delete(s, pos, len);
        }
        else
        {
            previous = s[pos];
            pos++;
        }
    }

    return len;
}
/// <summary>
/// Normalizes the word held in <paramref name="s"/>.
/// <para>
/// For words longer than 4 characters, accented vowels and 'ç' are replaced by
/// their unaccented counterparts and each repeated letter is passed to
/// StemmerUtil.delete. Then, while the word is still longer than 4 characters,
/// a trailing "ie" is dropped, followed by an optional trailing 'r', up to two
/// trailing 'e' and one character of a trailing doubled letter.
/// </para>
/// </summary>
/// <param name="s"> input buffer </param>
/// <param name="len"> length of input buffer </param>
/// <returns> length of input buffer after normalization </returns>
private int norm(char[] s, int len)
{
    if (len > 4)
    {
        // Strip accents.
        for (int k = 0; k < len; k++)
        {
            char c = s[k];
            if (c == 'à' || c == 'á' || c == 'â')
            {
                s[k] = 'a';
            }
            else if (c == 'ô')
            {
                s[k] = 'o';
            }
            else if (c == 'è' || c == 'é' || c == 'ê')
            {
                s[k] = 'e';
            }
            else if (c == 'ù' || c == 'û')
            {
                s[k] = 'u';
            }
            else if (c == 'î')
            {
                s[k] = 'i';
            }
            else if (c == 'ç')
            {
                s[k] = 'c';
            }
        }

        // Remove repeated letters.
        char previous = s[0];
        int pos = 1;
        while (pos < len)
        {
            if (s[pos] == previous && char.IsLetter(previous))
            {
                // Stay on this index; it is compared against 'previous' again.
                len = StemmerUtil.delete(s, pos, len);
            }
            else
            {
                previous = s[pos];
                pos++;
            }
        }
    }

    if (len > 4 && StemmerUtil.EndsWith(s, len, "ie"))
    {
        len -= 2;
    }

    if (len <= 4)
    {
        return len;
    }

    if (s[len - 1] == 'r')
    {
        len--;
    }
    if (s[len - 1] == 'e')
    {
        len--;
    }
    if (s[len - 1] == 'e')
    {
        len--;
    }

    char tail = s[len - 1];
    if (tail == s[len - 2] && char.IsLetter(tail))
    {
        len--;
    }

    return len;
}
/// <summary>
/// Normalize an input buffer of Hindi text.
/// <para>
/// Characters are either replaced in place by a fixed counterpart or passed to
/// StemmerUtil.delete; see the comments on the individual cases.
/// </para>
/// </summary>
/// <param name="s"> input buffer </param>
/// <param name="len"> length of input buffer </param>
/// <returns> length of input buffer after normalization </returns>
public virtual int normalize(char[] s, int len)
{
    int pos = 0;
    while (pos < len)
    {
        // Set when the character at 'pos' was deleted, so the index must not advance.
        bool removed = false;

        switch (s[pos])
        {
            // dead n -> bindu
            case '\u0928':
                if (pos + 1 < len && s[pos + 1] == '\u094D')
                {
                    s[pos] = '\u0902';
                    len = StemmerUtil.delete(s, pos + 1, len);
                }
                break;

            // candrabindu -> bindu
            case '\u0901':
                s[pos] = '\u0902';
                break;

            // nukta, zwj/zwnj and virama -> delete
            case '\u093C':
            case '\u200D':
            case '\u200C':
            case '\u094D':
                len = StemmerUtil.delete(s, pos, len);
                removed = true;
                break;

            // nukta forms -> base consonant
            case '\u0929':
                s[pos] = '\u0928';
                break;
            case '\u0931':
                s[pos] = '\u0930';
                break;
            case '\u0934':
                s[pos] = '\u0933';
                break;
            case '\u0958':
                s[pos] = '\u0915';
                break;
            case '\u0959':
                s[pos] = '\u0916';
                break;
            case '\u095A':
                s[pos] = '\u0917';
                break;
            case '\u095B':
                s[pos] = '\u091C';
                break;
            case '\u095C':
                s[pos] = '\u0921';
                break;
            case '\u095D':
                s[pos] = '\u0922';
                break;
            case '\u095E':
                s[pos] = '\u092B';
                break;
            case '\u095F':
                s[pos] = '\u092F';
                break;

            // chandra/short and long dependent vowels -> \u0947
            case '\u0945':
            case '\u0946':
            case '\u0948':
                s[pos] = '\u0947';
                break;

            // chandra/short and long dependent vowels -> \u094B
            case '\u0949':
            case '\u094A':
            case '\u094C':
                s[pos] = '\u094B';
                break;

            // chandra/short and long independent vowels -> \u090F
            case '\u090D':
            case '\u090E':
            case '\u0910':
                s[pos] = '\u090F';
                break;

            // chandra/short and long independent vowels -> \u0913
            case '\u0911':
            case '\u0912':
            case '\u0914':
                s[pos] = '\u0913';
                break;

            // chandra and long independent vowels -> \u0905
            case '\u0972':
            case '\u0906':
                s[pos] = '\u0905';
                break;

            // long -> short ind. vowels
            case '\u0908':
                s[pos] = '\u0907';
                break;
            case '\u090A':
                s[pos] = '\u0909';
                break;
            case '\u0960':
                s[pos] = '\u090B';
                break;
            case '\u0961':
                s[pos] = '\u090C';
                break;

            // long -> short dep. vowels
            case '\u0940':
                s[pos] = '\u093F';
                break;
            case '\u0942':
                s[pos] = '\u0941';
                break;
            case '\u0944':
                s[pos] = '\u0943';
                break;
            case '\u0963':
                s[pos] = '\u0962';
                break;

            default:
                break;
        }

        if (!removed)
        {
            pos++;
        }
    }

    return len;
}
/// <summary>
/// Advances the wrapped input by one token and rewrites its term buffer in place.
/// <para>
/// A small state value (N, U or V, declared elsewhere in the class) is carried
/// from character to character. An 'e' seen while the state is U is passed to
/// StemmerUtil.delete, so "ae", "oe" and a "ue" whose 'u' was reached in state N
/// lose their 'e'. 'ä', 'ö' and 'ü' are replaced by 'a', 'o' and 'u'; 'ß' is
/// replaced by "ss", which grows the term by one character.
/// </para>
/// </summary>
/// <returns> false when the input has no further token, true otherwise </returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    int state = N;
    char[] termBuffer = termAtt.buffer();
    int termLength = termAtt.length();

    for (int pos = 0; pos < termLength; pos++)
    {
        switch (termBuffer[pos])
        {
            case 'a':
            case 'o':
                state = U;
                break;

            case 'u':
                if (state == N)
                {
                    state = U;
                }
                else
                {
                    state = V;
                }
                break;

            case 'e':
                if (state == U)
                {
                    termLength = StemmerUtil.delete(termBuffer, pos, termLength);
                    // Step back so the loop increment lands on this same index again.
                    pos--;
                }
                state = V;
                break;

            case 'i':
            case 'q':
            case 'y':
                state = V;
                break;

            case 'ä':
                termBuffer[pos] = 'a';
                state = V;
                break;

            case 'ö':
                termBuffer[pos] = 'o';
                state = V;
                break;

            case 'ü':
                termBuffer[pos] = 'u';
                state = V;
                break;

            case 'ß':
                // First 's' overwrites the 'ß'; the second is inserted right after it.
                termBuffer[pos] = 's';
                pos++;
                termBuffer = termAtt.resizeBuffer(termLength + 1);
                if (pos < termLength)
                {
                    // Shift the tail one slot to the right to make room.
                    Array.Copy(termBuffer, pos, termBuffer, pos + 1, termLength - pos);
                }
                termBuffer[pos] = 's';
                termLength++;
                state = N;
                break;

            default:
                state = N;
                break;
        }
    }

    termAtt.Length = termLength;
    return true;
}