/// <summary>
        /// Splits <paramref name="raw_text"/> on runs of non-word characters, discards every token
        /// that equals an entry of <c>this.stopwords</c> (case-insensitive comparison), stems the
        /// surviving tokens with the English stemmer and returns them as one space-separated string.
        /// </summary>
        /// <param name="raw_text">Text to tokenize, filter and stem.</param>
        /// <returns>The stemmed, non-stopword tokens joined by single spaces, trimmed and lower-cased.</returns>
        public string process_stopword_stem(string raw_text)
        {
            var englishStemmer = new Iveonik.Stemmers.EnglishStemmer();
            List <string> keptStems = new List <string>();

            foreach (string token in Regex.Split(raw_text, @"\W+"))
            {
                // Linear scan of the stopword collection; stop at the first match.
                bool isStopword = false;
                foreach (var candidate in this.stopwords)
                {
                    if (String.Compare(token, candidate, true) == 0)
                    {
                        isStopword = true;
                        break;
                    }
                }

                if (isStopword)
                {
                    continue;
                }

                // Only tokens that are not stopwords are stemmed and kept.
                keptStems.Add(englishStemmer.Stem(token));
            }

            string joined = String.Join(" ", keptStems);
            return joined.Trim().ToLower();
        }
Example #2
0
 /// <summary>
 /// Splits <c>sentenceText</c> on whitespace, common punctuation and the digits 0-9
 /// (empty entries are dropped), stems every resulting token with the English stemmer
 /// and stores the stemmed tokens, in order, in <c>Words</c>.
 /// </summary>
 public void retrieveWordsEnglish()
 {
     Iveonik.Stemmers.IStemmer englishStemmer = new Iveonik.Stemmers.EnglishStemmer();

     // Every character listed here acts as a token separator.
     char[] separators = new char[] { ' ', '\r', '\n', '\t', '.', ',', ':', ';', '!', '?', '\"', '_', '-', '(', ')', '|', '$', '%', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
     string[] tokens = sentenceText.Split(separators, StringSplitOptions.RemoveEmptyEntries);

     Words = tokens.Select(token => englishStemmer.Stem(token)).ToList();
 }
        /// <summary>
        /// Splits every tweet into single-word tokens, classifies each token (stop word, English word,
        /// number, account name, hashtag, URL, stemmed root) and inserts one row per token through the
        /// <c>[Internal].[sp_InsertToken1Gram]</c> stored procedure.
        /// </summary>
        /// <param name="targetSQLConnectionString">Connection string of the database that receives the tokens.</param>
        /// <param name="tweets">Tweets to tokenize; each one supplies <c>ID</c> and <c>Text</c>.</param>
        /// <param name="englishWordDictionaryPath">Path of the file whose full text is passed to the English-word helpers.</param>
        /// <param name="stopWordsFilePath">Path of the file whose full text is passed to the stop-word helpers.</param>
        /// <remarks>
        /// Exceptions from reading the two files are not caught here. A failure to open the connection,
        /// a failed insert or a stemmer failure is logged and not rethrown; "done" is only logged when
        /// the connection could be opened.
        /// </remarks>
        public static void MineTokenizeTweet1Gram(string targetSQLConnectionString, List <TweetText> tweets, string englishWordDictionaryPath, string stopWordsFilePath)
        {
            logger.Info("MineTokenizeTweet1Gram start");

            List <tmToken1Gram> textMiningResults = new List <tmToken1Gram>();

            Iveonik.Stemmers.EnglishStemmer englishStemmer = new Iveonik.Stemmers.EnglishStemmer();

            string englishWordDictionary = File.ReadAllText(englishWordDictionaryPath);
            string stopWordsFile         = File.ReadAllText(stopWordsFilePath);

            foreach (var tweet in tweets)
            {
                var words = tmRemoveSpecialCharactersFromText(tweet.Text, true).Split(' ');
                foreach (var word in words)
                {
                    tmToken1Gram result = new tmToken1Gram();
                    result.TweetID = tweet.ID;
                    result.Token   = word;

                    // NOTE(review): the flag is true when the stop-word removal helper returns
                    // non-empty text, which reads as inverted for a helper that strips stop words.
                    // Kept as in the original because the helper is not visible here - confirm.
                    result.IsStopWord    = tmRemoveEnglishStopWords(word, stopWordsFile).Length > 0;
                    result.IsEnglishWord = tmRemoveNonEnglishWords(word, englishWordDictionary).Length > 0;
                    result.IsNotEnglishWordAndNotStopWord = tmRemoveNonEnglishWordsAndStopWords(word, englishWordDictionary, stopWordsFile).Length > 0;

                    result.IsNumber = tmCheckIfNumber(word);

                    // Ordinal prefix checks: these are fixed markers, not linguistic text.
                    result.IsAccountName = word.StartsWith("@", StringComparison.Ordinal);
                    result.IsHashtag     = word.StartsWith("#", StringComparison.Ordinal);

                    // "http" already covers "https", so no separate check is needed.
                    result.IsWebsiteUrl = word.StartsWith("www", StringComparison.Ordinal) || word.StartsWith("http", StringComparison.Ordinal);

                    if (result.IsEnglishWord == true && result.IsStopWord == false)
                    {
                        try
                        {
                            result.TokenRootWord = englishStemmer.Stem(word);
                        }
                        catch (Exception exc)
                        {
                            // Best effort: a stemmer failure must not abort the run, but it is recorded.
                            logger.Error(exc);
                        }

                        // Null-safe fallback: the root word may be null if stemming failed.
                        if (string.IsNullOrEmpty(result.TokenRootWord))
                        {
                            result.TokenRootWord = "N/A";
                        }
                    }

                    textMiningResults.Add(result);
                }
            }

            using (SqlConnection conn = new SqlConnection(targetSQLConnectionString))
            {
                try
                {
                    conn.Open();
                }
                catch (Exception exc)
                {
                    logger.Error(exc);
                    return;
                }

                foreach (var result in textMiningResults)
                {
                    using (SqlCommand cmd = new SqlCommand("[Internal].[sp_InsertToken1Gram]", conn))
                    {
                        cmd.CommandType = CommandType.StoredProcedure;

                        cmd.Parameters.Add("@TweetID", SqlDbType.Int).Value                        = result.TweetID;
                        cmd.Parameters.Add("@Token", SqlDbType.NVarChar, 500).Value                = result.Token;
                        // NOTE(review): TokenRootWord is only assigned above for non-stop English words;
                        // for other tokens it keeps whatever tmToken1Gram defaults to. If that is null the
                        // parameter is sent without a value - confirm the procedure tolerates this.
                        cmd.Parameters.Add("@TokenRootWord", SqlDbType.NVarChar, 500).Value        = result.TokenRootWord;
                        cmd.Parameters.Add("@IsEnglishWord", SqlDbType.Bit).Value                  = result.IsEnglishWord;
                        cmd.Parameters.Add("@IsStopWord", SqlDbType.Bit).Value                     = result.IsStopWord;
                        cmd.Parameters.Add("@IsNotEnglishWordAndNotStopWord", SqlDbType.Bit).Value = result.IsNotEnglishWordAndNotStopWord;
                        cmd.Parameters.Add("@IsHashtag", SqlDbType.Bit).Value                      = result.IsHashtag;
                        cmd.Parameters.Add("@IsAccountName", SqlDbType.Bit).Value                  = result.IsAccountName;
                        cmd.Parameters.Add("@IsNumber", SqlDbType.Bit).Value                       = result.IsNumber;
                        cmd.Parameters.Add("@IsWebsiteUrl", SqlDbType.Bit).Value                   = result.IsWebsiteUrl;

                        try
                        {
                            cmd.ExecuteNonQuery();
                        }
                        catch (Exception exc)
                        {
                            // One failed row is logged; the remaining rows are still attempted.
                            logger.Error(exc);
                        }
                    }
                }

                logger.Info("MineTokenizeTweet1Gram done");
            }
        }