Example #1
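        // Builds the tooltip text: the first 20 topic words of the node,
        // followed (when the scheme defines keywords) by a separator line and
        // the keywords that also appear among the topic words, sorted by value.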
        private string GetTooltipString()
        {
            var    words = _dataScheme.GetTopicWords(_dataNode);
            string label = "";
            int    index = 0;

            foreach (var word in words.Keys)
            {
                if (index > 0)
                {
                    label += ", ";
                }
                label += word;
                if (++index == 20)
                {
                    break;
                }
            }
            var keywords = _scheme.GetKeywords();

            if (keywords != null && keywords.Length > 0)
            {
                label += "\n-------------------------\n";
                Dictionary <string, double> keywordsDict = new Dictionary <string, double>();
                foreach (var keyword in keywords)
                {
                    double value;
                    var    keyword2 = keyword.ToLower();
                    if (words.TryGetValue(keyword2, out value))
                    {
                        if (!keywordsDict.ContainsKey(keyword2))
                        {
                            keywordsDict.Add(keyword2, value);
                        }
                    }
                }
                keywordsDict = SortUtils.EnsureSortedByValue(keywordsDict);
                label       += StringOperations.GetMergedString(keywordsDict, ' ');
            }
            return label;
        }
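        // Searches a Lucene index for documents containing any of the keywords,
        // runs a sentiment analyzer over every hit, and prints the polarity
        // ratios, the strongest positive/negative examples, and (optionally) a
        // histogram of a chosen field per sentiment class.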
        public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords, int printDocumentCnt = 10, string histogramField = null)
        {
            var searcher = LuceneOperations.GetIndexSearcher(indexPath);
            var reader   = searcher.GetIndexReader();
            var docIDs   = LuceneOperations.Search(searcher, StringOperations.GetMergedString(keywords, " "), field);

            Console.WriteLine("Found {0}% ({1}/{2}) documents containing: {3}", (100.0 * docIDs.Count / reader.NumDocs()), docIDs.Count, reader.NumDocs(), StringOperations.GetMergedString(keywords, " "));

            var              progress      = new ProgramProgress(docIDs.Count);
            var              sentiAnalyzer = new SentimentAnalyzer();
            SentimentType    sentimentType;
            double           sentimentScore;
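            // Heaps retaining the printDocumentCnt documents with the highest
            // |sentiment score| for each polarity, printed later as examples.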
            HeapSortDouble   hsdPos     = new HeapSortDouble(printDocumentCnt);
            HeapSortDouble   hsdNeg     = new HeapSortDouble(printDocumentCnt);
            Counter <string> counterPos = null;
            Counter <string> counterNeg = null;
            Counter <string> counterNeu = null;

            if (histogramField != null)
            {
                counterPos = new Counter <string>();
                counterNeg = new Counter <string>();
                counterNeu = new Counter <string>();
            }
            int posCnt = 0;
            int negCnt = 0;
            int neuCnt = 0;

            foreach (var docID in docIDs)
            {
                var document = reader.Document(docID);
                var content  = document.Get(field);
                sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);

                switch (sentimentType)
                {
                case SentimentType.Positive:
                    posCnt++;
                    hsdPos.Insert(docID, Math.Abs(sentimentScore));
                    if (histogramField != null)
                    {
                        counterPos.Add(document.Get(histogramField));
                    }
                    break;

                case SentimentType.Negative:
                    negCnt++;
                    hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                    if (histogramField != null)
                    {
                        counterNeg.Add(document.Get(histogramField));
                    }
                    break;

                case SentimentType.Neutral:
                    neuCnt++;
                    if (histogramField != null)
                    {
                        counterNeu.Add(document.Get(histogramField));
                    }
                    break;

                default:
                    throw new NotImplementedException();
                }

                progress.PrintIncrementExperiment();
            }

            Console.WriteLine("Positive document ratio {0}% ({1}/{2})", Math.Round(100.0 * posCnt / docIDs.Count), posCnt, docIDs.Count);
            Console.WriteLine("Negative document ratio {0}% ({1}/{2})", Math.Round(100.0 * negCnt / docIDs.Count), negCnt, docIDs.Count);
            Console.WriteLine("Neutral document ratio {0}% ({1}/{2})", Math.Round(100.0 * neuCnt / docIDs.Count), neuCnt, docIDs.Count);

            Console.WriteLine(StringOperations.WrapWithDash("Positive documents"));
            foreach (var kvp in hsdPos.GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }

            Console.WriteLine(StringOperations.WrapWithDash("Negative documents"));
            foreach (var kvp in hsdNeg.GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }

            progress.PrintTotalTime();

            if (histogramField != null)
            {
                string[]           featureStrings = new[] { "Pos", "Neg", "Neu" };
                Counter <string>[] counters       = new[] { counterPos, counterNeg, counterNeu };
                for (int i = 0; i < featureStrings.Length; i++)
                {
                    Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
                    int index = 0;
                    foreach (var kvp in counters[i].GetCountDictionary().OrderByDescending(kvp => kvp.Value))
                    {
                        Console.WriteLine(kvp.Key + "\t" + kvp.Value);
                        if (++index >= 100)
                        {
                            break;
                        }
                    }
                }
            }

            Console.ReadKey();
        }
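        // A usage sketch for the method above; the index path, field name, and
        // keywords are hypothetical placeholders, not values from the original
        // code:
        //
        //     AnalyzeSearchWordSentiment(@"D:\Index\Tweets", "Text",
        //         new[] { "storm", "flood" }, printDocumentCnt: 10,
        //         histogramField: "CreatedAt");

        // Scans tab-separated tweet files under inputFolder, keeps the lines
        // whose tokenized Text field contains at least one of the keywords,
        // de-duplicates them by (CreatedAt, UserId, Text), and writes the
        // survivors to a fresh Lucene index at indexPath.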
        public void Transform(string inputFolder, string indexPath, HashSet <string> keywords)
        {
            Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
            Console.WriteLine("InputFolder: " + inputFolder + "\n");

            string notParseSpecString = "Temp-DoNotParse";

            inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

            string[] schema = new[]
            {
                "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
                "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
            };
            var schemeDict            = Util.GetInvertedDictionary(schema);
            var textFieldIndex        = schemeDict["Text"];
            var createdTimeFieldIndex = schemeDict["CreatedAt"];
            var userIdFieldIndex      = schemeDict["UserId"];

            //string outputPath = inputFolder + notParseSpecString + "\\";
            //if (Directory.Exists(outputPath))
            //{
            //    Directory.Delete(outputPath, true);
            //}
            //Directory.CreateDirectory(outputPath);
            //var indexPath = outputPath + "Index\\";
            if (Directory.Exists(indexPath))
            {
                Directory.Delete(indexPath, true);
            }

            var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

            //Preprocess
            Console.WriteLine("Start preprocesing...");
            ProgramProgress progress   = new ProgramProgress(files.Length);
            int             estiDocCnt = 0;

            foreach (var file in files)
            {
                estiDocCnt += FileOperations.GetLineCount(file);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();
            Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

            //Parse
            Console.WriteLine("Start parsing...");

            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

            progress = new ProgramProgress(estiDocCnt);
            var sep             = new char[] { '\t' };
            int uniqDocFoundCnt = 0;
            int docFoundCnt     = 0;
            int docCnt          = 0;

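            // Three-level set keyed by (createdAt, userId, text), used below
            // to skip exact duplicate tweets before indexing.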
            ThreeLayerHashSet <string, long, string> hash3Layer = new ThreeLayerHashSet <string, long, string>();
            int notUsedDocCnt = 0;

            foreach (var file in files)
            {
                if (file.Contains(notParseSpecString))
                {
                    continue;
                }

                if (file.EndsWith(".txt"))
                {
                    var    sr = new StreamReader(file);
                    string line;

                    while ((line = sr.ReadLine()) != null)
                    {
                        var tokens = line.Split(sep, StringSplitOptions.None);
                        if (tokens.Length != schema.Length)
                        {
                            notUsedDocCnt++;
                            continue;
                            //throw new ArgumentException();
                        }

                        var  words           = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                        bool isContainSearch = false;
                        foreach (var word in words)
                        {
                            if (keywords.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        if (isContainSearch)
                        {
                            string createdAt = tokens[createdTimeFieldIndex];
                            long   userId    = long.Parse(tokens[userIdFieldIndex]);
                            string text      = tokens[textFieldIndex];

                            if (!hash3Layer.Contains(createdAt, userId, text))
                            {
                                var document = new Document();
                                for (int i = 0; i < schema.Length; i++)
                                {
                                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                                }
                                indexWriter.AddDocument(document);

                                hash3Layer.Add(createdAt, userId, text);

                                uniqDocFoundCnt++;
                            }
                            docFoundCnt++;
                        }
                        docCnt++;
                        progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                                                                        uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
                    }

                    sr.Close();
                }
            }
            progress.PrintTotalTime();

            Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                                            uniqDocFoundCnt, docCnt, (docCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docCnt), (docFoundCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docFoundCnt)));
            Console.WriteLine("Not used doc count: " + notUsedDocCnt);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Console.WriteLine("Finish");
            Console.ReadKey();
        }
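        // A usage sketch for Transform above; the paths and keywords are
        // hypothetical placeholders:
        //
        //     Transform(@"D:\Tweets\Raw", @"D:\Tweets\Index",
        //         new HashSet<string> { "flu", "fever" });

        // Iteratively refines the keyword list against the input index (at
        // most 5 rounds), writes the documents matched by the final keywords
        // to a new index at outputpath, and optionally logs the documents
        // that were dropped.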
        public void Start()
        {
            if (!outputpath.EndsWith("\\"))
            {
                outputpath += "\\";
            }

            var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);

            var searcher    = LuceneOperations.GetIndexSearcher(inputpath);
            var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
            var scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

            int  iter      = 0;
            bool bContinue = threshold != 0;

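            // Each round recounts word frequencies over the current hits,
            // takes the keywordNum most frequent words as the new query, and
            // stops once the new and old result sets overlap by at least the
            // threshold ratio.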
            while (bContinue && iter < 5)
            {
                iter++;
                Console.WriteLine("iteration------------------" + iter);
                List <string> keywordsNew;
                #region Calculate Keywords
                var counter = new Counter <string>();
                foreach (var scoredDoc in scoredDocs)
                {
                    var doc     = searcher.Doc(scoredDoc.doc);
                    var content = doc.Get(searchfield);
                    foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
                    {
                        counter.Add(word);
                    }
                }
                keywordsNew = counter.GetMostFreqObjs(keywordNum);
                #endregion

                var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);
                #region Test whether exit
                int repeatNum = 0;
                var docIDs    = new HashSet <int>();
                foreach (var scoredDoc in scoredDocs)
                {
                    docIDs.Add(scoredDoc.doc);
                }

                foreach (var scoredDocNew in scoredDocsNew)
                {
                    if (docIDs.Contains(scoredDocNew.doc))
                    {
                        repeatNum++;
                    }
                }

                bContinue = (double)repeatNum / scoredDocs.Length < threshold;
                #endregion

                Console.WriteLine(repeatNum + "  " + scoredDocsNew.Length);

                keywords   = keywordsNew;
                scoredDocs = scoredDocsNew;

                Console.WriteLine(StringOperations.GetMergedString(keywords));
            }

            max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
            scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);
            var writer = LuceneOperations.GetIndexWriter(outputpath);
            foreach (var scoredDoc in scoredDocs)
            {
                Document doc = searcher.Doc(scoredDoc.doc);
                writer.AddDocument(doc);
            }
            writer.Optimize();
            writer.Close();

            if (isPrintRemovedDocuments)
            {
                var sw             = new StreamWriter(outputpath + "removeDocuments.txt");
                var selectedDocIDs = new HashSet <int>();
                foreach (var scoredDoc in scoredDocs)
                {
                    selectedDocIDs.Add(scoredDoc.doc);
                }

                var reader = searcher.GetIndexReader();
                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    if (!selectedDocIDs.Contains(iDoc))
                    {
                        sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
                    }
                }
                reader.Close();
                sw.Flush();
                sw.Close();
            }

            searcher.Close();

            Console.WriteLine("Done");
            Console.ReadKey();
        }
Example #5
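        // Reads Spinn3r JSON tweet files and indexes the English tweets that
        // match the search set, either by tokenized tweet text (Main) or by
        // lower-cased author link (User), depending on searchType.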
        public void TransformWithFileNames(string[] files, string indexPath, HashSet <string> searchHashSet, SearchSpinn3rType searchType)
        {
            double         tweetCnt       = 0;
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        bool isContainSearch = false;
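                        // Decide whether this tweet matches the search set for
                        // the requested search type.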
                        switch (searchType)
                        {
                        case SearchSpinn3rType.Main:
                            var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                            foreach (var word in words)
                            {
                                if (searchHashSet.Contains(word))
                                {
                                    isContainSearch = true;
                                    break;
                                }
                            }
                            break;

                        case SearchSpinn3rType.User:
                            isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                            break;

                        default:
                            throw new ArgumentException();
                        }

                        if (isContainSearch)
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, (totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount)));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, (totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount));

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Util.ProgramFinishHalt();
        }
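        // A usage sketch for the method above; the file list, index path, and
        // search set are hypothetical placeholders:
        //
        //     TransformWithFileNames(
        //         Directory.GetFiles(@"D:\Spinn3r", "*.json"),
        //         @"D:\Index\Filtered",
        //         new HashSet<string> { "earthquake" },
        //         SearchSpinn3rType.Main);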
Example #6
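        // Same pipeline as TransformWithFileNames above, but a tweet matches
        // whenever its lower-cased text contains the literal search string.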
        public void TransformWithFileNameContentSearch(string[] files, string indexPath, string searchStr, string progressEndStr = null)
        {
            double tweetCnt    = 0;
            var    indexWriter = LuceneOperations.GetIndexWriter(indexPath);

            searchStr = searchStr.ToLower();

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        if (tweet.main.ToLower().Contains(searchStr))
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%) -- {3}", docFoundCount, totalDocCount, (totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount), progressEndStr));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, (totalDocCount == 0 ? 0 : 100 * docFoundCount / totalDocCount));

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            //Util.ProgramFinishHalt();
        }
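        // A usage sketch; the file list, index path, and search string are
        // hypothetical placeholders:
        //
        //     TransformWithFileNameContentSearch(
        //         Directory.GetFiles(@"D:\Spinn3r", "*.json"),
        //         @"D:\Index\Earthquake",
        //         "earthquake");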