Example #1
0
        /// <summary>
        /// Extracts the "@user" mentions from a tweet and returns the mentioned
        /// user names (without the leading '@'), lower-cased and filtered
        /// against the configured stop-word list.
        /// </summary>
        /// <param name="content">Raw tweet text.</param>
        /// <param name="config">Tokenization settings; only StopWords is used here.</param>
        /// <returns>Mentioned user names, in order of appearance.</returns>
        private static List <string> TokenizeTweetUser(string content, TokenizeConfig config)
        {
            // Invariant lower-casing avoids culture-specific surprises (e.g. Turkish 'I').
            content = content.ToLowerInvariant();
            content = RemoveContentNoise.RemoveTweetTokenizeNoise(content);

            var sep      = TweetSeperator;
            var tokens   = content.Split(sep, StringSplitOptions.RemoveEmptyEntries);
            var stophash = Util.GetHashSet(config.StopWords);

            List <string> words = new List <string>();

            foreach (var token in tokens)
            {
                // Only "@mention" tokens are of interest here.
                if (!token.StartsWith("@", StringComparison.Ordinal))
                {
                    continue;
                }

                var word = token.Substring(1);

                // Bug fix: a lone "@" token used to add an empty string to the result.
                if (word.Length > 0 && !stophash.Contains(word))
                {
                    words.Add(word);
                }
            }

            return(words);
        }
        /// <summary>
        /// Twitter data: from cosmos, each line represents a Tweet.
        /// Different fields are separated by '\t'. The schema is the name for each field.
        /// Builds a Lucene index at TwitterConfigure.OutputPath from the tab-separated
        /// tweet file at TwitterConfigure.InputPath.
        /// </summary>
        /// <exception cref="Exception">Thrown when a line's field count does not match the schema.</exception>
        private void BuildFromTwitterTxt()
        {
            string inputpath  = TwitterConfigure.InputPath;
            string outputpath = TwitterConfigure.OutputPath;
            var    schema     = TwitterConfigure.TwitterSchema;
            string bodyField  = TwitterConfigure.TwitterBodyField;

            var indexwriter = LuceneOperations.GetIndexWriter(outputpath);

            // First pass: count lines so progress can be reported as a fraction of the total.
            int lineCnt = 0;
            using (var counter = new StreamReader(inputpath))
            {
                while (counter.ReadLine() != null)
                {
                    lineCnt++;
                }
            }

            var seperator = new char[] { '\t' };
            var progress  = new ProgramProgress(lineCnt);

            // Second pass: index each tweet. 'using' guarantees the reader is closed
            // even when a malformed line throws below (the original leaked the handle).
            using (var sr = new StreamReader(inputpath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Empty fields are significant here, so do NOT remove empty entries.
                    var tokens = line.Split(seperator);
                    if (tokens.Length != schema.Length)
                    {
                        throw new Exception("Unmatch schema");
                    }

                    var document = new Document();
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        // The tweet body gets extra noise removal before indexing.
                        if (schema[i] == bodyField)
                        {
                            tokens[i] = RemoveContentNoise.RemoveTweetIndexNoise(tokens[i]);
                        }
                        document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                    }
                    indexwriter.AddDocument(document);

                    progress.PrintIncrementExperiment();
                }
            }
            progress.PrintTotalTime();

            indexwriter.Optimize();
            indexwriter.Close();
        }
Example #3
0
        /// <summary>
        /// Tokenizes a tweet into plain words: lower-cases the text, strips
        /// tokenization noise, drops "@user" mentions entirely, removes the '#'
        /// marker from hashtags, and keeps only tokens that match ENRegex, do not
        /// match ENNumRegex, and are not stop words.
        /// </summary>
        /// <param name="content">Raw tweet text.</param>
        /// <param name="config">Tokenization settings; only StopWords is used here.</param>
        /// <returns>The filtered word list, in order of appearance.</returns>
        private static List <string> TokenizeTwitter(string content, TokenizeConfig config)
        {
            var normalized = RemoveContentNoise.RemoveTweetTokenizeNoise(content.ToLower());
            var stopWords  = Util.GetHashSet(config.StopWords);
            var result     = new List<string>();

            foreach (var token in normalized.Split(TweetSeperator, StringSplitOptions.RemoveEmptyEntries))
            {
                // User mentions are not content words.
                if (token.StartsWith("@"))
                {
                    continue;
                }

                // Hashtags count as words once the '#' marker is removed.
                var word = token.StartsWith("#") ? token.Substring(1) : token;

                // Keep English-looking, non-numeric tokens that are not stop words.
                var isEnglishWord = ENRegex.Match(word).Success && !ENNumRegex.Match(word).Success;
                if (isEnglishWord && !stopWords.Contains(word))
                {
                    result.Add(word);
                }
            }

            return result;
        }