/// <summary>
/// Returns, as a string, the element at index <paramref name="sense"/> of the
/// <c>SynsetOffsets</c> of the index word found for <paramref name="lemma"/>.
/// </summary>
/// <param name="lemma">Lemma passed to <c>mEngine.GetIndexWord</c>.</param>
/// <param name="partOfSpeech">Not consulted: the lookup always uses the literal "noun".</param>
/// <param name="sense">Index into <c>SynsetOffsets</c>; it is not range-checked here.</param>
/// <returns>
/// The selected offset formatted with the invariant culture, or <c>null</c> when the
/// engine has no index word for the lemma.
/// </returns>
public virtual string GetSenseKey(string lemma, string partOfSpeech, int sense)
{
    IndexWord nounEntry = mEngine.GetIndexWord(lemma, "noun");

    return nounEntry == null
        ? null
        : nounEntry.SynsetOffsets[sense].ToString(System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Returns the <c>SenseCount</c> of the index word found for <paramref name="lemma"/>.
/// </summary>
/// <param name="lemma">Lemma passed to <c>mEngine.GetIndexWord</c>.</param>
/// <param name="pos">Not consulted: the lookup always uses the literal "noun".</param>
/// <returns>The sense count, or 0 when the engine has no index word for the lemma.</returns>
public virtual int GetSenseCount(string lemma, string pos)
{
    IndexWord nounEntry = mEngine.GetIndexWord(lemma, "noun");
    if (nounEntry == null)
    {
        // Unknown lemma: report zero senses rather than failing.
        return 0;
    }

    return nounEntry.SenseCount;
}
/// <summary>
/// Returns the <c>SenseCount</c> of the index word found for <paramref name="lemma"/>.
/// </summary>
/// <param name="lemma">Lemma passed to <c>mEngine.GetIndexWord</c>.</param>
/// <param name="pos">Not consulted: the lookup always uses the literal "noun".</param>
/// <returns>The sense count, or 0 when the engine has no index word for the lemma.</returns>
public virtual int getNumSenses(string lemma, string pos)
{
    IndexWord entry = mEngine.GetIndexWord(lemma, "noun");

    // A missing entry counts as having no senses.
    return (entry == null) ? 0 : entry.SenseCount;
}
/// <summary>
/// Returns, as a string, the element at index <paramref name="sense"/> of the
/// <c>SynsetOffsets</c> of the index word found for <paramref name="lemma"/>.
/// </summary>
/// <param name="lemma">Lemma passed to <c>mEngine.GetIndexWord</c>.</param>
/// <param name="pos">Not consulted: the lookup always uses the literal "noun".</param>
/// <param name="sense">Index into <c>SynsetOffsets</c>; it is not range-checked here.</param>
/// <returns>
/// The selected offset formatted with the invariant culture, or <c>null</c> when the
/// engine has no index word for the lemma.
/// </returns>
public virtual string getSenseKey(string lemma, string pos, int sense)
{
    IndexWord entry = mEngine.GetIndexWord(lemma, "noun");
    if (entry == null)
    {
        return null;
    }

    var offset = entry.SynsetOffsets[sense];
    return offset.ToString(System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Splits <paramref name="text"/> into words at whitespace and punctuation and lazily yields
/// one <c>IndexWord</c> per word that survives the length, noise and URL filters.
/// </summary>
/// <remarks>
/// <para>Each yielded item has <c>Word</c> set to the word text, <c>Pos</c> set to the index of
/// its first character (cast to <c>short</c>) and <c>ID</c> set to a running counter starting at 0.</para>
/// <para>Text that is SQL NULL, or that has two characters or fewer, yields nothing. A leading
/// byte-order mark is skipped. Scanning stops at the end of the text or at the first NUL character.</para>
/// <para>Once "http" (any letter case) is seen with more than four characters remaining, every
/// word is suppressed until the next whitespace character.</para>
/// </remarks>
/// <param name="text">Text to tokenize.</param>
/// <param name="minlen">Minimum number of characters a word must have to be yielded.</param>
/// <param name="repmax">Maximum run of the same letter; a longer run marks the word as noisy.</param>
/// <param name="hashtaguser">When true, a leading '#' or '@' is exempt from the symbol checks.</param>
/// <param name="maxlen">Maximum number of characters in a yielded word; 0 disables the limit.</param>
/// <returns>A lazily evaluated sequence of <c>IndexWord</c> items.</returns>
public static IEnumerable WordBreaker3(SqlString text, SqlInt16 minlen, SqlInt16 repmax, SqlBoolean hashtaguser, SqlInt16 maxlen)
{
    // A NULL text has no words; leave before the string conversion below can throw.
    if (text.IsNull)
    {
        yield break;
    }

    const string whitespace = " \t\n\r";
    const string punctuation = ",.!?;:\"-/+(){}[]\\`?_^~<>";

    string source = (string)text;
    int count = source.Length;

    // Inputs of two characters or fewer are never tokenized.
    if (count <= 2)
    {
        yield break;
    }

    short wordIndex = 0;      // number of words yielded so far
    int wordStart = 0;        // index of the first character of the current word
    int wordLength = 0;       // characters collected for the current word
    char lastLetter = '\0';   // letter that started the current run of repeats
    uint repeatCount = 0;     // length of that run; deliberately NOT reset at separators
    bool isNoisy = false;
    bool isUrl = false;

    // Skip a leading byte-order mark (U+FEFF, or its byte-swapped form U+FFFE).
    // Compare the whole character: testing only its low byte would also drop
    // ordinary letters such as U+00FF or U+00FE from the start of the text.
    int i = (source[0] == '\uFEFF' || source[0] == '\uFFFE') ? 1 : 0;

    // Walk one position past the end so the final word is flushed by the
    // same code path as any other separator.
    for (; i <= count; i++)
    {
        bool atEnd = i == count;
        char c = atEnd ? '\0' : source[i];
        bool isWhitespace = !atEnd && whitespace.IndexOf(c) != -1;
        bool isPunctuation = !atEnd && punctuation.IndexOf(c) != -1;

        if (isWhitespace || isPunctuation || c == '\0')
        {
            // Separator (or end of input): emit the pending word if it qualifies.
            if (wordLength >= (int)minlen && !isNoisy && !isUrl)
            {
                if (maxlen == 0 || wordLength <= (int)maxlen)
                {
                    IndexWord word = new IndexWord();
                    word.Word = source.Substring(wordStart, wordLength);
                    word.Pos = (short)wordStart;
                    word.ID = wordIndex++;
                    yield return word;
                }
            }

            wordLength = 0;
            isNoisy = false;

            // Only whitespace ends a URL; punctuation inside it does not.
            if (isWhitespace)
            {
                isUrl = false;
            }

            if (c == '\0')
            {
                break;
            }
        }
        else if (i < count - 4
                 && (c == 'h' || c == 'H')
                 && (source[i + 1] == 't' || source[i + 1] == 'T')
                 && (source[i + 2] == 't' || source[i + 2] == 'T')
                 && (source[i + 3] == 'p' || source[i + 3] == 'P'))
        {
            // Start of a URL. The 'h' itself is not added to the current word.
            isUrl = true;
        }
        else
        {
            // Count consecutive repeats. The +/-0x20 comparison folds ASCII case;
            // it can mis-pair a few non-letters, which is accepted.
            if (c == lastLetter || c == lastLetter + 0x20 || c == lastLetter - 0x20)
            {
                repeatCount++;
            }
            else
            {
                lastLetter = c;
                repeatCount = 1;
            }

            // Same letter repeated more than repmax times in a row.
            isNoisy |= repeatCount > (int)repmax;

            // A '#' or '@' that opens a word skips the symbol checks when requested.
            bool isTagPrefix = false;
            if (wordLength == 0 && hashtaguser)
            {
                isTagPrefix = c == '#' || c == '@';
            }

            if (!isTagPrefix)
            {
                isNoisy |= c > 0x05FF;                 // not Latin, Greek or Cyrillic
                isNoisy |= c < 0x41;                   // symbols (everything below 'A')
                isNoisy |= 0x7B <= c && c < 0xC0;      // more symbols
                isNoisy |= 0x0300 <= c && c < 0x0380;  // more symbols
            }

            // Append the character to the current word.
            if (wordLength == 0)
            {
                wordStart = i;
            }

            wordLength++;
        }
    }
}