/// <summary>
/// Extracts the user names mentioned in a tweet, i.e. the tokens that start with '@'.
/// </summary>
/// <param name="content">Tweet text. It is lower-cased and passed through
/// <c>RemoveContentNoise.RemoveTweetTokenizeNoise</c> before being split on <c>TweetSeperator</c>.</param>
/// <param name="config">Tokenizer configuration; only <c>StopWords</c> is read here.</param>
/// <returns>
/// The mentioned names with the leading '@' removed, in order of appearance (duplicates are kept).
/// Names found in the stop-word set and bare "@" tokens are omitted.
/// </returns>
private static List<string> TokenizeTweetUser(string content, TokenizeConfig config)
{
    // Invariant lower-casing keeps the result independent of the machine's current culture
    // (e.g. under tr-TR, ToLower() maps 'I' to a dotless 'ı').
    content = content.ToLowerInvariant();
    content = RemoveContentNoise.RemoveTweetTokenizeNoise(content);

    var tokens = content.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
    var stophash = Util.GetHashSet(config.StopWords);

    List<string> words = new List<string>();
    foreach (var token in tokens)
    {
        // Ordinal comparison: '@' is a literal marker, not linguistic text.
        if (!token.StartsWith("@", StringComparison.Ordinal))
        {
            continue;
        }

        var word = token.Substring(1);

        // A bare "@" carries no user name; do not emit an empty entry for it.
        if (word.Length == 0)
        {
            continue;
        }

        if (!stophash.Contains(word))
        {
            words.Add(word);
        }
    }

    return words;
}
/// <summary>
/// Builds a Lucene index from a Twitter text file in which each line represents one tweet.
/// The fields of a line are separated by '\t'; <c>TwitterConfigure.TwitterSchema</c> supplies
/// the field name for each column, in order.
/// </summary>
/// <remarks>
/// The input file is read twice: once to count its lines (the total handed to
/// <c>ProgramProgress</c>) and once to index them. Every column is added as a stored, analyzed
/// field; the column named by <c>TwitterConfigure.TwitterBodyField</c> is first passed through
/// <c>RemoveContentNoise.RemoveTweetIndexNoise</c>.
/// </remarks>
/// <exception cref="System.IO.InvalidDataException">
/// A line does not have exactly as many tab-separated columns as the schema has entries.
/// </exception>
private void BuildFromTwitterTxt()
{
    string inputpath = TwitterConfigure.InputPath;
    string outputpath = TwitterConfigure.OutputPath;
    var schema = TwitterConfigure.TwitterSchema;
    string bodyField = TwitterConfigure.TwitterBodyField;

    var indexwriter = LuceneOperations.GetIndexWriter(outputpath);
    try
    {
        // First pass: count the lines so progress reporting knows the total.
        int lineCnt = 0;
        using (var counter = new StreamReader(inputpath))
        {
            while (counter.ReadLine() != null)
            {
                lineCnt++;
            }
        }

        var seperator = new char[] { '\t' };
        var progress = new ProgramProgress(lineCnt);

        // Second pass: turn every line into one document.
        using (var sr = new StreamReader(inputpath))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                // Empty columns are kept on purpose so positions stay aligned with the schema.
                var tokens = line.Split(seperator);
                if (tokens.Length != schema.Length)
                {
                    throw new System.IO.InvalidDataException("Unmatch schema");
                }

                var document = new Document();
                for (int i = 0; i < tokens.Length; i++)
                {
                    if (schema[i] == bodyField)
                    {
                        tokens[i] = RemoveContentNoise.RemoveTweetIndexNoise(tokens[i]);
                    }
                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                }

                indexwriter.AddDocument(document);
                progress.PrintIncrementExperiment();
            }

            progress.PrintTotalTime();
        }

        indexwriter.Optimize();
    }
    finally
    {
        // Always close the writer, even when indexing fails part-way, so it is not left open.
        indexwriter.Close();
    }
}
/// <summary>
/// Splits a tweet into word tokens, dropping user mentions and stop words.
/// </summary>
/// <param name="content">Tweet text. It is lower-cased and passed through
/// <c>RemoveContentNoise.RemoveTweetTokenizeNoise</c> before being split on <c>TweetSeperator</c>.</param>
/// <param name="config">Tokenizer configuration; only <c>StopWords</c> is read here.</param>
/// <returns>
/// The accepted tokens in order of appearance (duplicates are kept). Tokens starting with '@' are
/// skipped, one leading '#' is stripped from hashtags, and a token is kept only when it matches
/// <c>ENRegex</c>, does not match <c>ENNumRegex</c>, and is not in the stop-word set.
/// </returns>
private static List<string> TokenizeTwitter(string content, TokenizeConfig config)
{
    // Invariant lower-casing keeps the result independent of the machine's current culture
    // (e.g. under tr-TR, ToLower() maps 'I' to a dotless 'ı').
    content = content.ToLowerInvariant();
    content = RemoveContentNoise.RemoveTweetTokenizeNoise(content);

    var tokens = content.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries);
    var stophash = Util.GetHashSet(config.StopWords);

    List<string> words = new List<string>();
    foreach (var token in tokens)
    {
        // User mentions are not words. Ordinal comparison: '@' and '#' are literal
        // markers, not linguistic text.
        if (token.StartsWith("@", StringComparison.Ordinal))
        {
            continue;
        }

        var word = token;
        if (word.StartsWith("#", StringComparison.Ordinal))
        {
            // Treat a hashtag as the word it tags.
            word = word.Substring(1);
        }

        // NOTE(review): ENRegex / ENNumRegex are defined outside this block; presumably they
        // accept English-looking tokens and reject numeric ones - confirm against their patterns.
        if (ENRegex.Match(word).Success && !ENNumRegex.Match(word).Success)
        {
            if (!stophash.Contains(word))
            {
                words.Add(word);
            }
        }
    }

    return words;
}