Code Example #1
        private static List <string> TokenizeHashtag(string content, TokenizeConfig config)
        {
            content = content.ToLower();
            var tokens   = content.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
            var stophash = new HashSet <string>();

            foreach (var stopword in config.StopWords)
            {
                if (stopword.StartsWith("#"))
                {
                    stophash.Add(stopword);
                }
            }

            List <string> words = new List <string>();

            foreach (var token in tokens)
            {
                if (token.StartsWith("#"))
                {
                    if (!stophash.Contains(token))
                    {
                        words.Add(token);
                    }
                }
            }

            //Trace.WriteLine(content);
            //DiagnosticsOperations.Print(words);

            return(words);
        }
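A quick usage sketch for the hashtag tokenizer above. The TokenizeConfig object-initializer shape and the example strings are assumptions for illustration; TweetSeperator is a separator array defined elsewhere in the class:

        // Hypothetical call site: stoplisted hashtags are dropped, the rest kept.
        var config = new TokenizeConfig { StopWords = new List<string> { "#news" } };
        var tags   = TokenizeHashtag("Big win today! #WorldCup #news", config);
        // tags == ["#worldcup"] -- content is lower-cased before matching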
Code Example #2
        private static List <string> TokenizeRetweet(string content, TokenizeConfig config)
        {
            //content = content.ToLower();
            if (content.StartsWith("RT @"))
            {
                int index = content.IndexOf(":");
                if (index >= 0)
                {
                    content = content.Substring(0, index);
                }
            }
            else
            {
                return(new List <string>());
            }


            var tokens = content.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);

            List <string> words = new List <string>();

            foreach (var token in tokens)
            {
                if (token.StartsWith("@"))
                {
                    var word = token.Substring(1);
                    if (!config.StopWords.Contains(word))
                    {
                        words.Add(word);
                    }
                }
            }

            return(words);
        }
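To make the control flow concrete: only strings starting with "RT @" are scanned, and only the header before the first colon contributes @-mentions. A hedged sketch, reusing a config built as in the previous sketch:

        // "RT @alice: nice thread @bob" is cut down to "RT @alice", so only
        // the retweeted user survives:
        var users = TokenizeRetweet("RT @alice: nice thread @bob", config); // ["alice"]
        // Anything that is not a retweet yields an empty list:
        var none = TokenizeRetweet("just a normal tweet @alice", config);   // []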
Code Example #3
        private static List <string> TokenizeTweetUser(string content, TokenizeConfig config)
        {
            content = content.ToLower();
            content = RemoveContentNoise.RemoveTweetTokenizeNoise(content);

            var sep      = TweetSeperator;
            var tokens   = content.Split(sep, StringSplitOptions.RemoveEmptyEntries);
            var stophash = Util.GetHashSet(config.StopWords);

            List <string> words = new List <string>();

            foreach (var token in tokens)
            {
                if (token.StartsWith("@"))
                {
                    var word = token;
                    word = word.Substring(1);

                    if (!stophash.Contains(word))
                    {
                        words.Add(word);
                    }
                }
            }

            return(words);
        }
Code Example #4
        private static List <string> TokenizeFeatureVector(string content, TokenizeConfig config)
        {
            var wordDict                = new Dictionary <string, int>();
            var wordDictString          = content;
            var wordDictStringSplitList = wordDictString.Split(new string[] { "\\n" }, StringSplitOptions.RemoveEmptyEntries);

            // Each entry has the form "word(count)"; entries are joined by the
            // literal two-character sequence "\n" (not a newline) in the input.
            foreach (string str in wordDictStringSplitList)
            {
                if (!string.IsNullOrWhiteSpace(str))
                {
                    int pos2 = str.IndexOf('(');
                    wordDict[str.Substring(0, pos2)] = int.Parse(str.Substring(pos2 + 1, str.Length - pos2 - 2));
                }
            }
            List <string> words = new List <string>();

            foreach (var kvp in wordDict)
            {
                if (!config.StopWords.Contains(kvp.Key))
                {
                    for (int i = 0; i < kvp.Value; i++)
                    {
                        words.Add(kvp.Key);
                    }
                }
            }
            //wordDict =
            //    wordDict.Where(kvp => !config.StopWords.Contains(kvp.Key))
            //        .ToDictionary(kvp => kvp.Key, kvp => kvp.Value);

            return(words);
        }
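The expected input is a pre-counted feature vector serialized as word(count) entries joined by the literal two-character sequence \n. A sketch, with config assumed as before; the verbatim string keeps the backslash literal, matching the Split on "\\n":

        // Each word is re-expanded count times, minus stopwords:
        var words = TokenizeFeatureVector(@"apple(2)\nbanana(1)", config);
        // words == ["apple", "apple", "banana"]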
Code Example #5
        static List <string> TokenizeICTCLAS(string content, TokenizeConfig config)
        {
            if (!IsICTCLASInitialized)
            {
                if (!NLPIR_Init(datapath, 0, ""))//给出Data文件所在的路径,注意根据实际情况修改。
                {
                    throw new Exception("Init ICTCLAS failed!");
                }
                //System.Console.WriteLine("Init ICTCLAS success!");

                IsICTCLASInitialized = true;
            }

            //Add user dictionary
            if (config.UserDict != null && config.UserDict.Count != 0)
            {
                foreach (var kvp in config.UserDict)
                {
                    NLPIR_AddUserWord(kvp.Key + " " + kvp.Value);//word + POS tag, e.g. "点击下载 vyou"
                }
            }

            //Tokenize
            var           intPtr = NLPIR_ParagraphProcess(content.ToLower(), 1);
            var           str    = Marshal.PtrToStringAnsi(intPtr);
            var           tokens = str.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            List <string> words  = new List <string>();

            foreach (var token in tokens)
            {
                var index = token.IndexOf('/');
                if (index > 0)
                {
                    words.Add(token.Substring(0, index));
                }
            }

            //Filter Stopwords
            var words2   = new List <string>();
            var stophash = StopFilter.MakeStopSet(config.StopWords);

            foreach (var word in words)
            {
                // The bare "Regex.Match(word)" in the original would not compile
                // (Regex.Match is static and needs a pattern); ENRegex, the field
                // used by TokenizeTwitter below, is presumably what is intended.
                if (!stophash.Contains(word) && ENRegex.Match(word).Success)
                {
                    words2.Add(word);
                }
            }

            return(words2);
        }
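For context: NLPIR_ParagraphProcess with a second argument of 1 returns one space-separated string of word/POS-tag pairs, and the parsing loop above keeps only the text before each '/'. A sketch of the data shape, with a hypothetical user dictionary in config:

        // config.UserDict maps word -> POS tag and is registered before segmentation.
        // For input "点击下载app" the tagged output looks roughly like
        // "点击下载/vyou app/x", which the parsing loop reduces to
        // ["点击下载", "app"] before stopword filtering.
        var words = TokenizeICTCLAS("点击下载app", config);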
Code Example #6
        private static List <string> TokenizeSimpleSplit(string content, TokenizeConfig config)
        {
            content = content.ToLower();

            var tokens = content.Split(SimpleSplitSeperator, StringSplitOptions.RemoveEmptyEntries);

            List <string> words = new List <string>();

            foreach (var word in tokens)
            {
                //if (ENRegex.Match(word).Success)
                if (word.Length < 20)
                {
                    words.Add(word);
                }
            }

            return(words);
        }
Code Example #7
        static List <string> TokenizeStandard(string content, TokenizeConfig config)
        {
            StringReader reader = new StringReader(content);
            TokenStream  result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);

            var stophash = StopFilter.MakeStopSet(config.StopWords);

            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, stophash, true);

            // Set up lexicon/invert-lexicon, feature vectors, word-appearance counts
            result.Reset();
            TermAttribute termattr = (TermAttribute)result.GetAttribute(typeof(TermAttribute));
            List <string> words    = new List <string>();

            while (result.IncrementToken())
            {
                words.Add(termattr.Term());
            }
            return(words);
        }
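This is the classic Lucene.NET 2.x token-stream chain: StandardTokenizer → StandardFilter → LowerCaseFilter → StopFilter, consumed token by token via IncrementToken. A minimal sketch of calling it, assuming TokenizeConfig has a settable StopWords list:

        var config = new TokenizeConfig { StopWords = new List<string> { "the", "a" } };
        var words  = TokenizeStandard("The quick brown fox", config);
        // words == ["quick", "brown", "fox"] -- lower-cased, stopwords removed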
Code Example #8
        public static List <string> Tokenize(string content, TokenizeConfig config)
        {
            switch (config.TokenizerType)
            {
            case TokenizerType.Standard:
                return(TokenizeStandard(content, config));

            case TokenizerType.ICTCLAS:
                return(TokenizeICTCLAS(content, config));

            case TokenizerType.Twitter:
                return(TokenizeTwitter(content, config));

            case TokenizerType.TweetUser:
                return(TokenizeTweetUser(content, config));

            case TokenizerType.Hashtag:
                return(TokenizeHashtag(content, config));

            case TokenizerType.Mention:
                return(TokenizeMention(content, config));

            case TokenizerType.Retweet:
                return(TokenizeRetweet(content, config));

            case TokenizerType.ChineseWordBreaker:
                return(TokenizeCWB(content, config));

            case TokenizerType.SimpleSplit:
                return(TokenizeSimpleSplit(content, config));

            case TokenizerType.FeatureVector:
                return(TokenizeFeatureVector(content, config));

            default:
                throw new NotImplementedException();
            }
        }
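Tokenize is the public entry point; callers pick a strategy through TokenizeConfig.TokenizerType instead of calling the private helpers directly. A hedged example, again assuming settable properties on TokenizeConfig; the exact output depends on TweetSeperator and RemoveTweetTokenizeNoise, which are defined elsewhere:

        var config = new TokenizeConfig
        {
            TokenizerType = TokenizerType.TweetUser,
            StopWords     = new List<string>()
        };
        // Dispatches to TokenizeTweetUser; roughly, returns the mentioned users:
        List<string> users = Tokenize("thanks for the RT @Alice!", config); // ["alice"]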
Code Example #9
        private static List <string> TokenizeTwitter(string content, TokenizeConfig config)
        {
            content = content.ToLower();
            content = RemoveContentNoise.RemoveTweetTokenizeNoise(content);

            var sep      = TweetSeperator;
            var tokens   = content.Split(sep, StringSplitOptions.RemoveEmptyEntries);
            var stophash = Util.GetHashSet(config.StopWords);

            List <string> words = new List <string>();

            foreach (var token in tokens)
            {
                if (token.StartsWith("@"))
                {
                    continue;
                }
                var word = token;
                if (word.StartsWith("#"))
                {
                    word = word.Substring(1);
                }
                if (ENRegex.Match(word).Success&& !ENNumRegex.Match(word).Success)
                {
                    if (!stophash.Contains(word))
                    {
                        words.Add(word);
                    }
                }
            }

            //Trace.WriteLine(content);
            //DiagnosticsOperations.Print(words);

            return(words);
        }
Code Example #10
        public VectorGenerator(TokenizeConfig tokenizeConfig, Dictionary <string, int> fieldWeightDict = null, Dictionary <string, int> leadingSentencesCnt = null)
        {
            if (fieldWeightDict == null)
            {
                fieldWeightDict = new Dictionary <string, int>()
                {
                    { BingNewsFields.NewsArticleHeadline, 3 },
                    { BingNewsFields.NewsArticleDescription, 1 }
                };
            }

            if (leadingSentencesCnt == null)
            {
                leadingSentencesCnt = new Dictionary <string, int> {
                    { BingNewsFields.NewsArticleDescription, 6 }
                };
            }

            _tokenizeConfig      = tokenizeConfig;
            _fieldWeightDict     = fieldWeightDict;
            _leadingSentencesCnt = leadingSentencesCnt;

            Lexicon = new Dictionary <string, int>();
        }
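The constructor above supplies defaults when the optional dictionaries are null: headlines weighted 3, descriptions 1, and the first 6 sentences of the description used. A sketch of both call styles, with tokenizeConfig assumed to exist:

        // Take the defaults (headline x3, description x1, 6 leading sentences):
        var vg = new VectorGenerator(tokenizeConfig);

        // Or weight the fields explicitly:
        var vgWeighted = new VectorGenerator(tokenizeConfig, new Dictionary<string, int>
        {
            { BingNewsFields.NewsArticleHeadline, 5 },
            { BingNewsFields.NewsArticleDescription, 2 }
        });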
Code Example #11
        private static List <string> TokenizeCWB(string content, TokenizeConfig config)
        {
            if (_chineseWordBreaker == null)
            {
                _chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");
            }

            //Tokenize
            var words = _chineseWordBreaker.Tokenize(content);

            //Filter Stopwords
            var words2   = new List <string>();
            var stophash = StopFilter.MakeStopSet(config.StopWords);

            foreach (var word in words)
            {
                // As in TokenizeICTCLAS, the original "Regex.Match(word)" would not
                // compile; ENRegex (the field used by TokenizeTwitter) is assumed here.
                if (!stophash.Contains(word) && ENRegex.Match(word).Success)
                {
                    words2.Add(word);
                }
            }

            return(words2);
        }