/// <summary>
/// Extracts hashtag tokens (terms beginning with '#') from tweet content.
/// Hashtags that appear in the configured stop-word list are skipped.
/// </summary>
/// <param name="content">Raw tweet text; lower-cased before splitting.</param>
/// <param name="config">Tokenizer configuration supplying the stop words.</param>
/// <returns>Lower-cased hashtag tokens, '#' prefix retained.</returns>
private static List<string> TokenizeHashtag(string content, TokenizeConfig config)
{
    var lowered = content.ToLower();
    var tokens = lowered.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);

    // Only stop words that are themselves hashtags matter for this tokenizer.
    var hashtagStops = new HashSet<string>();
    foreach (var stopword in config.StopWords)
    {
        if (stopword.StartsWith("#"))
        {
            hashtagStops.Add(stopword);
        }
    }

    var result = new List<string>();
    foreach (var token in tokens)
    {
        if (token.StartsWith("#") && !hashtagStops.Contains(token))
        {
            result.Add(token);
        }
    }

    return result;
}
/// <summary>
/// Extracts the user names mentioned in the "RT @user:" header of a retweet.
/// Content that does not start with "RT @" yields an empty list.
/// </summary>
/// <param name="content">Raw tweet text. Matching is intentionally case sensitive
/// (the lower-casing step was deliberately disabled).</param>
/// <param name="config">Tokenizer configuration supplying the stop words.</param>
/// <returns>Mentioned user names (without the '@'), stop words removed.</returns>
private static List<string> TokenizeRetweet(string content, TokenizeConfig config)
{
    if (!content.StartsWith("RT @"))
    {
        return new List<string>();
    }

    // Keep only the retweet header: everything before the first ':'.
    int index = content.IndexOf(":");
    if (index >= 0)
    {
        content = content.Substring(0, index);
    }

    // FIX: use a hash set for the stop-word lookup instead of calling
    // List.Contains inside the loop (O(1) vs O(n) per token). This also
    // matches how the other tokenizers in this class filter stop words.
    var stophash = new HashSet<string>(config.StopWords);

    var words = new List<string>();
    foreach (var token in content.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries))
    {
        if (token.StartsWith("@"))
        {
            var word = token.Substring(1);
            if (!stophash.Contains(word))
            {
                words.Add(word);
            }
        }
    }
    return words;
}
/// <summary>
/// Extracts mentioned user names ('@' tokens) from tweet content.
/// The content is lower-cased and cleaned of tweet noise before splitting.
/// </summary>
/// <param name="content">Raw tweet text.</param>
/// <param name="config">Tokenizer configuration supplying the stop words.</param>
/// <returns>User names (without the '@'), stop words removed.</returns>
private static List<string> TokenizeTweetUser(string content, TokenizeConfig config)
{
    var cleaned = RemoveContentNoise.RemoveTweetTokenizeNoise(content.ToLower());
    var tokens = cleaned.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
    var stophash = Util.GetHashSet(config.StopWords);

    var users = new List<string>();
    foreach (var token in tokens)
    {
        if (!token.StartsWith("@"))
        {
            continue;
        }
        var userName = token.Substring(1);
        if (!stophash.Contains(userName))
        {
            users.Add(userName);
        }
    }
    return users;
}
/// <summary>
/// Re-expands a serialized feature vector back into a flat word list.
/// The input is a sequence of "word(count)" entries separated by the literal
/// two-character string "\n" (backslash + 'n', not a newline). Each word is
/// repeated 'count' times in the output; stop words are dropped entirely.
/// </summary>
/// <param name="content">Serialized feature-vector string.</param>
/// <param name="config">Tokenizer configuration supplying the stop words.</param>
/// <returns>Words repeated by their counts, stop words excluded.</returns>
private static List<string> TokenizeFeatureVector(string content, TokenizeConfig config)
{
    var wordDict = new Dictionary<string, int>();
    var entries = content.Split(new string[] { "\\n" }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string entry in entries)
    {
        if (string.IsNullOrWhiteSpace(entry))
        {
            continue;
        }
        // Entry format is "word(count)"; a malformed entry (no parentheses)
        // will throw, same as the original behavior.
        int open = entry.IndexOf('(');
        // FIX: dropped the redundant (int) cast on int.Parse (it already
        // returns int). Count is the text between '(' and the trailing ')'.
        wordDict[entry.Substring(0, open)] =
            int.Parse(entry.Substring(open + 1, entry.Length - open - 2));
    }

    // FIX: use a hash set for the stop-word lookup instead of calling
    // List.Contains once per dictionary entry (O(1) vs O(n)), consistent
    // with the other tokenizers in this class.
    var stophash = new HashSet<string>(config.StopWords);

    var words = new List<string>();
    foreach (var kvp in wordDict)
    {
        if (stophash.Contains(kvp.Key))
        {
            continue;
        }
        for (int i = 0; i < kvp.Value; i++)
        {
            words.Add(kvp.Key);
        }
    }
    return words;
}
// Tokenizes Chinese text via the native ICTCLAS/NLPIR word segmenter (P/Invoke).
// Lazily initializes the native library on first call, loads any user dictionary
// from the config, segments the lower-cased content, strips part-of-speech tags,
// and finally removes stop words and tokens rejected by the Regex filter.
// NOTE(review): not thread-safe — the init flag and user-dictionary additions
// mutate shared native state without synchronization; confirm single-threaded use.
static List <string> TokenizeICTCLAS(string content, TokenizeConfig config)
{
    if (!IsICTCLASInitialized)
    {
        if (!NLPIR_Init(datapath, 0, "")) // Path of the Data directory; adjust to the actual deployment.
        {
            throw new Exception("Init ICTCLAS failed!");
        }
        //System.Console.WriteLine("Init ICTCLAS success!");
        IsICTCLASInitialized = true;
    }

    // Add user dictionary entries, format: "word pos", e.g. "点击下载 vyou".
    if (config.UserDict != null && config.UserDict.Count != 0)
    {
        foreach (var kvp in config.UserDict)
        {
            NLPIR_AddUserWord(kvp.Key + " " + kvp.Value);
        }
    }

    // Tokenize: the native call returns a pointer to an ANSI string of
    // space-separated "token/POS" pairs (second argument 1 = include POS tags).
    var intPtr = NLPIR_ParagraphProcess(content.ToLower(), 1);
    var str = Marshal.PtrToStringAnsi(intPtr);
    var tokens = str.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    List <string> words = new List <string>();
    foreach (var token in tokens)
    {
        // Keep only the text before the '/' POS separator; index > 0 also
        // drops tokens that start with '/' or have no POS tag.
        var index = token.IndexOf('/');
        if (index > 0)
        {
            words.Add(token.Substring(0, index));
        }
    }

    // Filter stop words; Regex is presumably a class-level pattern that
    // whitelists acceptable tokens — TODO confirm its definition.
    var words2 = new List <string>();
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    foreach (var word in words)
    {
        if (!stophash.Contains(word) && Regex.Match(word).Success)
        {
            words2.Add(word);
        }
    }
    return(words2);
}
/// <summary>
/// Splits lower-cased content on the simple separator set, keeping only
/// tokens shorter than 20 characters.
/// </summary>
/// <param name="content">Raw text to tokenize.</param>
/// <param name="config">Tokenizer configuration (unused by this tokenizer).</param>
/// <returns>Lower-cased tokens under the length cutoff.</returns>
private static List<string> TokenizeSimpleSplit(string content, TokenizeConfig config)
{
    var result = new List<string>();
    var tokens = content.ToLower().Split(SimpleSplitSeperator, StringSplitOptions.RemoveEmptyEntries);
    foreach (var token in tokens)
    {
        // Drop implausibly long tokens (URLs, run-together junk, etc.).
        if (token.Length < 20)
        {
            result.Add(token);
        }
    }
    return result;
}
// Tokenizes content with the Lucene.NET 2.4 analyzer pipeline:
// StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter.
// Returns the surviving terms in stream order. The exact filter order matters
// (stop-word matching happens after lower-casing), so the pipeline is kept as-is.
// NOTE(review): the TokenStream/StringReader are not closed here — presumably
// acceptable for in-memory readers, but confirm against Lucene's contract.
static List <string> TokenizeStandard(string content, TokenizeConfig config)
{
    StringReader reader = new StringReader(content);
    TokenStream result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stophash, true);

    // Drain the stream: Reset() must precede IncrementToken(); the term
    // attribute exposes each surviving token's text.
    result.Reset();
    TermAttribute termattr = (TermAttribute)result.GetAttribute(typeof(TermAttribute));
    List <string> words = new List <string>();
    while (result.IncrementToken())
    {
        words.Add(termattr.Term());
    }
    return(words);
}
/// <summary>
/// Entry point: dispatches to the tokenizer implementation selected by
/// <c>config.TokenizerType</c>.
/// </summary>
/// <param name="content">Raw text to tokenize.</param>
/// <param name="config">Tokenizer configuration (type, stop words, etc.).</param>
/// <returns>The token list produced by the selected tokenizer.</returns>
/// <exception cref="NotImplementedException">Unknown tokenizer type.</exception>
public static List<string> Tokenize(string content, TokenizeConfig config)
{
    switch (config.TokenizerType)
    {
        case TokenizerType.Standard:
            return TokenizeStandard(content, config);
        case TokenizerType.ICTCLAS:
            return TokenizeICTCLAS(content, config);
        case TokenizerType.Twitter:
            return TokenizeTwitter(content, config);
        case TokenizerType.TweetUser:
            return TokenizeTweetUser(content, config);
        case TokenizerType.Hashtag:
            return TokenizeHashtag(content, config);
        case TokenizerType.Mention:
            return TokenizeMention(content, config);
        case TokenizerType.Retweet:
            return TokenizeRetweet(content, config);
        case TokenizerType.ChineseWordBreaker:
            return TokenizeCWB(content, config);
        case TokenizerType.SimpleSplit:
            return TokenizeSimpleSplit(content, config);
        case TokenizerType.FeatureVector:
            return TokenizeFeatureVector(content, config);
        default:
            throw new NotImplementedException();
    }
}
/// <summary>
/// Tokenizes tweet content into plain words: mentions ('@') are dropped,
/// hashtags keep their text but lose the '#', and tokens must pass the
/// ENRegex filter, not be purely numeric (ENNumRegex), and not be stop words.
/// </summary>
/// <param name="content">Raw tweet text.</param>
/// <param name="config">Tokenizer configuration supplying the stop words.</param>
/// <returns>Lower-cased, filtered word tokens.</returns>
private static List<string> TokenizeTwitter(string content, TokenizeConfig config)
{
    var cleaned = RemoveContentNoise.RemoveTweetTokenizeNoise(content.ToLower());
    var tokens = cleaned.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
    var stophash = Util.GetHashSet(config.StopWords);

    var result = new List<string>();
    foreach (var token in tokens)
    {
        // Mentions are handled by the TweetUser tokenizer, not here.
        if (token.StartsWith("@"))
        {
            continue;
        }

        // Hashtags contribute their bare text.
        var candidate = token.StartsWith("#") ? token.Substring(1) : token;

        if (!ENRegex.Match(candidate).Success || ENNumRegex.Match(candidate).Success)
        {
            continue;
        }
        if (!stophash.Contains(candidate))
        {
            result.Add(candidate);
        }
    }
    return result;
}
/// <summary>
/// Creates a vector generator with the given tokenizer configuration.
/// When not supplied, field weights default to headline=3 / description=1,
/// and the leading-sentence count defaults to 6 for the description field.
/// </summary>
/// <param name="tokenizeConfig">Tokenizer settings used when vectorizing.</param>
/// <param name="fieldWeightDict">Optional per-field term weights.</param>
/// <param name="leadingSentencesCnt">Optional per-field leading-sentence limits.</param>
public VectorGenerator(TokenizeConfig tokenizeConfig,
    Dictionary<string, int> fieldWeightDict = null,
    Dictionary<string, int> leadingSentencesCnt = null)
{
    _tokenizeConfig = tokenizeConfig;

    _fieldWeightDict = fieldWeightDict ?? new Dictionary<string, int>()
    {
        { BingNewsFields.NewsArticleHeadline, 3 },
        { BingNewsFields.NewsArticleDescription, 1 }
    };

    _leadingSentencesCnt = leadingSentencesCnt ?? new Dictionary<string, int>
    {
        { BingNewsFields.NewsArticleDescription, 6 }
    };

    Lexicon = new Dictionary<string, int>();
}
/// <summary>
/// Tokenizes content with the Chinese word breaker, then removes stop words
/// and tokens rejected by the Regex filter.
/// </summary>
/// <param name="content">Raw text to tokenize.</param>
/// <param name="config">Tokenizer configuration supplying the stop words.</param>
/// <returns>Filtered Chinese word tokens.</returns>
private static List<string> TokenizeCWB(string content, TokenizeConfig config)
{
    // Lazily create the shared word breaker on first use.
    if (_chineseWordBreaker == null)
    {
        _chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");
    }

    var stophash = StopFilter.MakeStopSet(config.StopWords);

    var kept = new List<string>();
    foreach (var word in _chineseWordBreaker.Tokenize(content))
    {
        if (stophash.Contains(word) || !Regex.Match(word).Success)
        {
            continue;
        }
        kept.Add(word);
    }
    return kept;
}