Пример #1
0
        /// <summary>
        /// Collects the user mentions (tokens that begin with '@') found in a tweet.
        /// </summary>
        /// <param name="content">
        /// Tweet text. It is lower-cased and passed through
        /// <c>RemoveContentNoise.RemoveTweetTokenizeNoise</c> before being split on
        /// <c>TweetSeperator</c>.
        /// </param>
        /// <param name="config">
        /// Supplies <c>StopWords</c>; a mention whose name is contained in the set built
        /// from it is dropped.
        /// </param>
        /// <returns>
        /// The mentioned names with the leading '@' removed, in order of appearance.
        /// Duplicates are kept; empty names are never returned.
        /// </returns>
        private static List<string> TokenizeTweetUser(string content, TokenizeConfig config)
        {
            content = content.ToLower();
            content = RemoveContentNoise.RemoveTweetTokenizeNoise(content);

            var tokens   = content.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
            var stophash = Util.GetHashSet(config.StopWords);

            var words = new List<string>();

            foreach (var token in tokens)
            {
                // '@' is a literal marker, not linguistic text, so compare ordinally:
                // the result must not depend on the current culture.
                if (!token.StartsWith("@", StringComparison.Ordinal))
                {
                    continue;
                }

                var word = token.Substring(1);

                // A bare "@" (e.g. "hello @ world") carries no user name; skip it
                // instead of reporting an empty mention.
                if (word.Length == 0)
                {
                    continue;
                }

                if (!stophash.Contains(word))
                {
                    words.Add(word);
                }
            }

            return words;
        }
Пример #2
0
        /// <summary>
        /// Splits a tweet into word tokens, skipping user mentions and stripping the
        /// leading '#' from hashtags.
        /// </summary>
        /// <param name="content">
        /// Tweet text. It is lower-cased and passed through
        /// <c>RemoveContentNoise.RemoveTweetTokenizeNoise</c> before being split on
        /// <c>TweetSeperator</c>.
        /// </param>
        /// <param name="config">
        /// Supplies <c>StopWords</c>; a token contained in the set built from it is dropped.
        /// </param>
        /// <returns>
        /// The surviving tokens in order of appearance (duplicates are kept). A token
        /// survives only if <c>ENRegex</c> matches it and <c>ENNumRegex</c> does not.
        /// </returns>
        private static List<string> TokenizeTwitter(string content, TokenizeConfig config)
        {
            var normalized = RemoveContentNoise.RemoveTweetTokenizeNoise(content.ToLower());
            var pieces     = normalized.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
            var stopWords  = Util.GetHashSet(config.StopWords);

            var result = new List<string>();

            foreach (var piece in pieces)
            {
                // User mentions are not treated as words here.
                if (piece.StartsWith("@"))
                {
                    continue;
                }

                // A hashtag contributes its text without the '#'.
                var candidate = piece.StartsWith("#") ? piece.Substring(1) : piece;

                // NOTE(review): ENRegex / ENNumRegex are defined elsewhere; presumably
                // they accept English text and reject numeric tokens - confirm.
                if (!ENRegex.Match(candidate).Success)
                {
                    continue;
                }

                if (ENNumRegex.Match(candidate).Success)
                {
                    continue;
                }

                if (stopWords.Contains(candidate))
                {
                    continue;
                }

                result.Add(candidate);
            }

            return result;
        }