コード例 #1
0
        /// <summary>
        /// Returns the key of one sense of <paramref name="lemma"/>, taken from the index entry
        /// the engine holds for that lemma.
        /// </summary>
        /// <param name="lemma">Lemma to look up.</param>
        /// <param name="partOfSpeech">Not consulted: the lookup below always asks for the "noun" entry.</param>
        /// <param name="sense">Index into the entry's synset offsets.</param>
        /// <returns>
        /// The selected synset offset formatted with the invariant culture,
        /// or <c>null</c> when the engine returns no entry for the lemma.
        /// </returns>
        public virtual string GetSenseKey(string lemma, string partOfSpeech, int sense)
        {
            IndexWord entry = mEngine.GetIndexWord(lemma, "noun");

            return entry == null
                ? null
                : entry.SynsetOffsets[sense].ToString(System.Globalization.CultureInfo.InvariantCulture);
        }
コード例 #2
0
        /// <summary>
        /// Reports how many senses the engine's index entry lists for <paramref name="lemma"/>.
        /// </summary>
        /// <param name="lemma">Lemma to look up.</param>
        /// <param name="pos">Not consulted: the lookup below always asks for the "noun" entry.</param>
        /// <returns>The entry's sense count, or 0 when the engine returns no entry for the lemma.</returns>
        public virtual int GetSenseCount(string lemma, string pos)
        {
            IndexWord entry = mEngine.GetIndexWord(lemma, "noun");

            return entry == null ? 0 : entry.SenseCount;
        }
コード例 #3
0
        /// <summary>
        /// Looks up <paramref name="lemma"/> in the engine and returns the number of senses of its index entry.
        /// </summary>
        /// <param name="lemma">Lemma to look up.</param>
        /// <param name="pos">Not consulted: the lookup is always made against the "noun" entries.</param>
        /// <returns>The sense count of the entry found, or 0 if the engine has no entry for the lemma.</returns>
        public virtual int getNumSenses(string lemma, string pos)
        {
            IndexWord nounEntry = mEngine.GetIndexWord(lemma, "noun");

            if (nounEntry != null)
            {
                return nounEntry.SenseCount;
            }

            // No entry for the lemma: report zero senses.
            return 0;
        }
コード例 #4
0
        /// <summary>
        /// Looks up <paramref name="lemma"/> in the engine and returns the synset offset of the requested
        /// sense as a string.
        /// </summary>
        /// <param name="lemma">Lemma to look up.</param>
        /// <param name="pos">Not consulted: the lookup is always made against the "noun" entries.</param>
        /// <param name="sense">Index into the entry's synset offsets.</param>
        /// <returns>
        /// The synset offset at <paramref name="sense"/>, formatted with the invariant culture,
        /// or <c>null</c> if the engine has no entry for the lemma.
        /// </returns>
        public virtual string getSenseKey(string lemma, string pos, int sense)
        {
            IndexWord nounEntry = mEngine.GetIndexWord(lemma, "noun");

            if (nounEntry != null)
            {
                var offset = nounEntry.SynsetOffsets[sense];
                return offset.ToString(System.Globalization.CultureInfo.InvariantCulture);
            }

            // No entry for the lemma: there is no key to report.
            return null;
        }
コード例 #5
0
        /// <summary>
        /// Splits <paramref name="text"/> into words on whitespace and punctuation and lazily yields one
        /// <c>IndexWord</c> per accepted word, carrying the word text (<c>Word</c>), its start offset in the
        /// input (<c>Pos</c>) and a running index over the accepted words (<c>ID</c>).
        /// </summary>
        /// <remarks>
        /// A word is dropped when it is shorter than <paramref name="minlen"/>, longer than a non-zero
        /// <paramref name="maxlen"/>, flagged as noisy, or part of a whitespace-delimited token in which
        /// "http" (any letter case) was seen. A word is noisy when a letter (ignoring ASCII case) occurs
        /// more than <paramref name="repmax"/> times in a row, or when it contains a character outside
        /// the accepted ranges (see the range checks in the body).
        /// Inputs that are NULL or at most two characters long produce no words. An embedded NUL
        /// character ends processing.
        /// </remarks>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="minlen">Minimum accepted word length, in characters.</param>
        /// <param name="repmax">Maximum number of consecutive repetitions of one letter before a word is noisy.</param>
        /// <param name="hashtaguser">When true, a leading '#' or '@' is kept as part of the word instead of marking it noisy.</param>
        /// <param name="maxlen">Maximum accepted word length; 0 disables the upper limit.</param>
        /// <returns>A lazily evaluated sequence of <c>IndexWord</c> values in input order.</returns>
        public static IEnumerable WordBreaker3(SqlString text, SqlInt16 minlen, SqlInt16 repmax, SqlBoolean hashtaguser, SqlInt16 maxlen)
        {
            // A NULL input contains no words; converting it to string would throw SqlNullValueException.
            if (text.IsNull)
            {
                yield break;
            }

            string _text = (string)text;
            int    count = _text.Length;

            // Inputs of two characters or fewer are never tokenized.
            if (count <= 2)
            {
                yield break;
            }

            const string whitespace  = " \t\n\r";
            const string punctuation = ",.!?;:\"-/+(){}[]\\`?_^~<>";

            short word_idx = 0;     // number of words emitted so far
            int   ws       = 0;     // start offset of the current word
            int   wc       = 0;     // length of the current word
            char  ll       = '\0';  // letter that started the current run of repeats
            uint  lc       = 0;     // length of the current run of repeats
            bool  isnoisy  = false; // current word contains noise
            bool  isurl    = false; // "http" was seen since the last whitespace

            // Skip a leading byte order mark. Compare the whole character: testing only its low byte
            // would also swallow ordinary letters such as U+00FF or U+00FE.
            int i = (_text[0] == '\uFEFF' || _text[0] == '\uFFFE') ? 1 : 0;

            // Walk one position past the end so that the final word is flushed by a virtual NUL.
            for (; i <= count; i++)
            {
                char c = (i == count) ? '\0' : _text[i];

                bool isWhitespace  = whitespace.IndexOf(c) != -1;
                bool isPunctuation = punctuation.IndexOf(c) != -1;

                if (isWhitespace || isPunctuation || c == '\0')
                {
                    // Flush the pending word. wc > 0 keeps consecutive delimiters from emitting
                    // empty words (with a stale start offset) when minlen is zero or negative.
                    if (wc > 0 && wc >= (int)minlen && !isnoisy && !isurl)
                    {
                        if (maxlen == 0 || wc <= (int)maxlen)
                        {
                            IndexWord tmp = new IndexWord();
                            tmp.Word = _text.Substring(ws, wc);
                            tmp.Pos  = (short)ws;   // NOTE(review): wraps for offsets above short.MaxValue
                            tmp.ID   = word_idx++;
                            yield return tmp;
                        }
                    }

                    // Reset all per-word state, including the repeat run, so that letters of the
                    // previous word cannot make the next word noisy.
                    wc      = 0;
                    isnoisy = false;
                    ll      = '\0';
                    lc      = 0;

                    // Only whitespace ends a URL; punctuation inside it does not.
                    if (isWhitespace)
                    {
                        isurl = false;
                    }

                    if (c == '\0')
                    {
                        break;
                    }
                }
                else if (i < count - 4 &&
                         (_text[i] == 'h' || _text[i] == 'H') &&
                         (_text[i + 1] == 't' || _text[i + 1] == 'T') &&
                         (_text[i + 2] == 't' || _text[i + 2] == 'T') &&
                         (_text[i + 3] == 'p' || _text[i + 3] == 'P'))
                {
                    // Everything up to the next whitespace is treated as part of a URL and dropped.
                    isurl = true;
                }
                else
                {
                    // Track runs of the same letter. Case is folded by a plain +/-0x20 offset, which
                    // may produce a few false matches outside the ASCII letters.
                    if (c == ll || c == ll + 0x20 || c == ll - 0x20)
                    {
                        lc++;
                    }
                    else
                    {
                        ll = c;
                        lc = 1;
                    }

                    isnoisy |= lc > (int)repmax;

                    // A '#' or '@' opening a word is exempt from the symbol checks when hashtags
                    // and user mentions are to be kept.
                    bool skipSymbolChecks = false;
                    if (wc == 0 && hashtaguser)
                    {
                        skipSymbolChecks = c == '#' || c == '@';
                    }

                    if (!skipSymbolChecks)
                    {
                        isnoisy |= c > 0x05FF;                  // beyond U+05FF: not Latin, Greek or Cyrillic
                        isnoisy |= c < 0x41;                    // below 'A': digits and symbols
                        isnoisy |= 0x7B <= c && c < 0xC0;       // U+007B..U+00BF: more symbols
                        isnoisy |= 0x0300 <= c && c < 0x0380;   // U+0300..U+037F: more symbols
                    }

                    // Append the character to the current word.
                    if (wc == 0)
                    {
                        ws = i;
                    }
                    wc++;
                }
            }
        }