Example #1
        public void Start()
        {
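            // Merge several Lucene indices into one: a first pass counts the
            // documents in every input index so the progress reporter knows the
            // total, then a second pass copies each document into the output index.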
            var writer = LuceneOperations.GetIndexWriter(OutputPath);

            var totalDocCnt = 0;

            foreach (var inputPath in InputPaths)
            {
                var reader = LuceneOperations.GetIndexReader(inputPath);
                totalDocCnt += reader.NumDocs();
                reader.Close();
            }

            var progress = new ProgramProgress(totalDocCnt);

            foreach (var inputPath in InputPaths)
            {
                var reader = LuceneOperations.GetIndexReader(inputPath);
                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    writer.AddDocument(reader.Document(iDoc));
                    progress.PrintIncrementExperiment();
                }
                reader.Close();
            }

            writer.Optimize();
            writer.Close();
        }
Example #2
        public void StartTransformTweetIndexForStreamingRoseRiver()
        {
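            // Rewrite a tweet index for the streaming RoseRiver pipeline: the DocId
            // and DocumentURL fields of every document are replaced with values
            // derived from its sequential document number (outDoc aliases inDoc, so
            // the field edits happen in place).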
            string inputPath  = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
            string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";

            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

            string          docIDField = BingNewsFields.DocId;
            string          urlField   = BingNewsFields.DocumentURL;
            ProgramProgress progress   = new ProgramProgress(indexReader.NumDocs());

            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc  = indexReader.Document(iDoc);
                Document outDoc = inDoc;

                outDoc.RemoveField(docIDField);
                outDoc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                outDoc.RemoveField(urlField);
                outDoc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                indexWriter.AddDocument(outDoc);
                progress.PrintIncrementExperiment();
            }


            indexWriter.Optimize();
            indexWriter.Close();

            indexReader.Close();
        }
Example #3
        public static void AnalyzeLanguageDistribution(string inputPath)
        {
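            // Count how many news articles exist per language: each input file is
            // either a zip archive of Bing News XML or a plain XML file, and the
            // Language node of every NewsArticleDescription's parent is tallied.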
            Counter <string> counter  = new Counter <string>();
            var             filenames = Directory.GetFiles(inputPath, "*.*", System.IO.SearchOption.AllDirectories);
            ProgramProgress progress  = new ProgramProgress(filenames.Length, PrintType.Console);

            foreach (var filename in filenames)
            {
                ZipFile            zipfile = null;
                List <XmlDocument> xmldocs = new List <XmlDocument>();
                if (filename.EndsWith(".zip"))
                {
                    zipfile = new ZipFile(filename);
                    foreach (ZipEntry entry in zipfile.Entries)
                    {
                        // Use a fresh stream per entry: reusing a disposed stream
                        // would fail on the second entry.
                        using (MemoryStream ms = new MemoryStream())
                        {
                            entry.Extract(ms);
                            ms.Position = 0;
                            XmlDocument xmldoc = new XmlDocument();
                            xmldoc.Load(ms);
                            xmldocs.Add(xmldoc);
                        }
                    }
                }
                else
                {
                    try
                    {
                        XmlDocument xmldoc = new XmlDocument();
                        xmldoc.Load(filename);
                        xmldocs.Add(xmldoc);
                    }
                    catch
                    {
                        var xmldoclist = DataProcess.DataTransform.BuildLuceneIndex.GetXMLDocList(filename);
                        xmldocs.AddRange(xmldoclist);
                    }
                }
                foreach (XmlDocument xmldoc in xmldocs)
                {
                    XmlNodeList list = xmldoc.GetElementsByTagName("NewsArticleDescription");
                    foreach (XmlNode bodynemapnode in list)
                    {
                        XmlNode newsnode     = bodynemapnode.ParentNode;
                        XmlNode languagenode = newsnode.SelectSingleNode("Language");
                        counter.Add(languagenode.InnerText);
                    }
                    // Delete temp file
                    //File.Delete(extractpath + entry.FileName);
                }
                progress.PrintIncrementExperiment();
            }

            foreach (var kvp in counter.GetCountDictionary())
            {
                Console.WriteLine(kvp.Key + "\t" + kvp.Value);
            }
        }
Example #4
        /// <summary>
        /// Twitter data from Cosmos: each line represents a tweet.
        /// Fields are separated by '\t'; the schema gives the name of each field.
        /// </summary>
        private void BuildFromTwitterTxt()
        {
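            // Two passes over the input file: the first counts lines so the progress
            // reporter knows the total, the second splits each line by '\t' and
            // indexes it according to the schema.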
            string inputpath  = TwitterConfigure.InputPath;
            string outputpath = TwitterConfigure.OutputPath;
            var    schema     = TwitterConfigure.TwitterSchema;
            string bodyField  = TwitterConfigure.TwitterBodyField;

            var indexwriter = LuceneOperations.GetIndexWriter(outputpath);

            StreamReader sr = new StreamReader(inputpath);
            string       line;
            int          lineCnt = 0;

            while ((line = sr.ReadLine()) != null)
            {
                lineCnt++;
            }
            //Console.WriteLine("Total Lines: " + lineCnt);
            sr.Close();

            sr = new StreamReader(inputpath);
            var separator = new char[] { '\t' };
            int lineIndex = 0;
            var progress  = new ProgramProgress(lineCnt);

            while ((line = sr.ReadLine()) != null)
            {
                //if (lineIndex % 100000 == 0)
                //    Console.WriteLine("{0} out of {1} ({2}%)", lineIndex, lineCnt, 100 * lineIndex / lineCnt);

                var tokens = line.Split(separator);//, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length != schema.Length)
                {
                    throw new Exception("Unmatch schema");
                }
                var document = new Document();
                for (int i = 0; i < tokens.Length; i++)
                {
                    if (schema[i] == bodyField)
                    {
                        tokens[i] = RemoveContentNoise.RemoveTweetIndexNoise(tokens[i]);
                    }
                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                }
                indexwriter.AddDocument(document);

                lineIndex++;
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            sr.Close();

            indexwriter.Optimize();
            indexwriter.Close();
        }
Example #5
        public void TransformWithFileNameContentSearch(string[] files, string indexPath, string searchStr, string progressEndStr = null)
        {
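            // Build a Lucene index from Spinn3r JSON files, keeping only English
            // tweets whose text contains the (lower-cased) search string.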
            double tweetCnt    = 0;
            var    indexWriter = LuceneOperations.GetIndexWriter(indexPath);

            searchStr = searchStr.ToLower();

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        if (tweet.main.ToLower().Contains(searchStr))
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%) -- {3}", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount, progressEndStr));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            //Util.ProgramFinishHalt();
        }
Example #6
        public void AnalyzeDocuments()
        {
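            // For each topic, keep the topDocCnt documents with the highest topic
            // weight, using one bounded heap per topic. The json file maps a
            // document ID to its topic distribution.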
            string fileName  = @"D:\Project\TopicPanorama\data\TopicGraphs\NewCode-Ebola-Test2\Raw\news\result\lda.top.json";
            string indexPath = @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1";

            int topDocCnt = 20;

            var indexReader = LuceneOperations.GetIndexReader(indexPath);

            //Read from json and sort
            SimpleJsonReader reader = new SimpleJsonReader(new StreamReader(File.Open(fileName, FileMode.Open)));

            HeapSortDouble[] hsd        = null;
            int             topicNumber = -1;
            ProgramProgress progress    = new ProgramProgress(indexReader.NumDocs());

            while (reader.IsReadable)
            {
                int      docID      = int.Parse(reader.ReadPropertyName());
                double[] topicArray = reader.ReadDoubleArray();

                if (topicNumber < 0)
                {
                    topicNumber = topicArray.Length;
                    hsd         = new HeapSortDouble[topicNumber];
                    for (int i = 0; i < topicNumber; i++)
                    {
                        hsd[i] = new HeapSortDouble(topDocCnt);
                    }
                }

                for (int i = 0; i < topicNumber; i++)
                {
                    hsd[i].Insert(docID, topicArray[i]);
                }
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            //Statistics


            Console.ReadLine();
        }
Example #7
        private List <int> RemoveSameURLDocument(IndexReader reader, List <int> orgDocIDs)
        {
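            // Deduplicate documents by URL (compared lower-cased): the first
            // document with a given URL is kept, later ones are dropped.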
            var newDocIDs = new List <int>();

            var docNum = orgDocIDs.Count;
            HashSet <string> urlHash = new HashSet <string>();

            Console.WriteLine("Total {0} docs", docNum);

            int    removeDocNum = 0;
            string urlfield     = Configure.URLField;
            var    progress     = new ProgramProgress(docNum);

            foreach (var iDoc in orgDocIDs)
            {
                var    document = reader.Document(iDoc);
                string url      = document.Get(urlfield);
                if (url != null)
                {
                    url = url.ToLower();
                    if (!urlHash.Contains(url))
                    {
                        newDocIDs.Add(iDoc);
                        urlHash.Add(url);
                    }
                    else
                    {
                        removeDocNum++;
                    }
                }

                progress.PrintIncrementExperiment();
            }
            Console.WriteLine("Finished remove same URL. Removed {0} out of {1}", removeDocNum, docNum);

            return newDocIDs;
        }
Example #8
        public void Start()
        {
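            // Read every document from the input index into an XmlDoc array and
            // serialize the whole collection to the configured output file.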
            var             reader   = LuceneOperations.GetIndexReader(Configure.InputPath);
            var             docNum   = reader.NumDocs();
            ProgramProgress progress = new ProgramProgress(docNum);

            XmlDoc[] xmlDocs = new XmlDoc[docNum];
            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var doc = reader.Document(iDoc);
                xmlDocs[iDoc] = new XmlDoc(doc);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            // Serialize the collection
            XmlSerializer serializer = new XmlSerializer(typeof(XmlDocCollection));

            // Serialize the collection to the output file (the writer must be
            // disposed so buffered output is flushed)
            using (var writer = new StreamWriter(Configure.OutputPath))
            {
                serializer.Serialize(writer, new XmlDocCollection()
                {
                    XmlDocs = xmlDocs
                });
            }
        }
Example #9
        private void BuildLuceneFromFile(string filename, List <string[]> keywordLists, List <IndexWriter> indexwriters, List <string> languages, string[] selectedFields,
                                         int[] newsfoundcnts, List <StreamWriter> infofiles, ProgramProgress progress)
        {
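            // Scan one Bing News file (zip archive or plain XML) and, for every
            // keyword list, index the articles that contain one of its keywords and
            // are written in an accepted language; per-list counts go to the info
            // files.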
            //string filename = filenames[ifilename];
            int deltanewsfoundcnt = 0;

            ZipFile            zipfile = null;
            List <XmlDocument> xmldocs = new List <XmlDocument>();

            if (filename.EndsWith(".zip"))
            {
                zipfile = new ZipFile(filename);
                foreach (ZipEntry entry in zipfile.Entries)
                {
                    // Use a fresh stream per entry: reusing a disposed stream
                    // would fail on the second entry.
                    using (MemoryStream ms = new MemoryStream())
                    {
                        entry.Extract(ms);
                        ms.Position = 0;
                        XmlDocument xmldoc = new XmlDocument();
                        xmldoc.Load(ms);
                        xmldocs.Add(xmldoc);
                    }
                }
            }
            else
            {
                try
                {
                    XmlDocument xmldoc = new XmlDocument();
                    xmldoc.Load(filename);
                    xmldocs.Add(xmldoc);
                }
                catch
                {
                    var xmldoclist = GetXMLDocList(filename);
                    xmldocs.AddRange(xmldoclist);
                }
            }
            foreach (XmlDocument xmldoc in xmldocs)
            {
                XmlNodeList list = xmldoc.GetElementsByTagName("NewsArticleDescription");
                foreach (XmlNode bodynemapnode in list)
                {
                    for (int ikeyword = 0; ikeyword < keywordLists.Count; ikeyword++)
                    {
                        var         keywords    = keywordLists[ikeyword];
                        IndexWriter indexwriter = indexwriters[ikeyword];
                        string      str         = bodynemapnode.InnerText;
                        bool        bStore      = false;
                        foreach (var keyword in keywords)
                        {
                            if (str.Contains(keyword))
                            {
                                bStore = true;
                                break;
                            }
                        }

                        if (bStore)
                        {
                            XmlNode newsnode     = bodynemapnode.ParentNode;
                            XmlNode languagenode = newsnode.SelectSingleNode("Language");
                            //Keep only articles written in one of the accepted languages
                            if (!languages.Contains(languagenode.InnerText))
                            {
                                continue;
                            }

                            // Unique document: extract all useful fields
                            string   docid    = newsnode.Attributes[0].Value;
                            Document document = new Document();
                            document.Add(new Field("DocId", docid, Field.Store.YES, Field.Index.ANALYZED));
                            foreach (string fieldname in selectedFields)
                            {
                                XmlNode node = newsnode.SelectSingleNode(fieldname);
                                if (node != null)
                                {
                                    string luceneFieldName = fieldname;
                                    if (luceneFieldName == "DocumentUrl")
                                    {
                                        luceneFieldName = "DocumentURL";
                                    }
                                    document.Add(new Field(luceneFieldName, node.InnerText, Field.Store.YES, Field.Index.ANALYZED));
                                }
                            }

                            indexwriter.AddDocument(document);
                            Interlocked.Increment(ref newsfoundcnts[ikeyword]);
                            deltanewsfoundcnt++;
                        }
                    }
                }

                // Delete temp file
                //File.Delete(extractpath + entry.FileName);
            }

            for (int ikeyword = 0; ikeyword < keywordLists.Count; ikeyword++)
            {
                infofiles[ikeyword].WriteLine(filename + "\t\t" + deltanewsfoundcnt + "\t\t" + newsfoundcnts[ikeyword]);
                infofiles[ikeyword].Flush();
            }

            progress.PrintIncrementExperiment();
        }
Example #10
        public void StartKDD()
        {
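            // Scalability experiment on the KDD data: sweep copy factor, focus
            // count, focus sample, and tree count; for each combination, write the
            // configure file next to RoseRiver.exe and run it to completion.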
            // -- node counts --
            string     folder     = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\Data\KddInfovisGraphicsIndex_Lucene_a=0.003_sm=1\";
            string     exeFolder  = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\RoseRiver\bin\x64\Release\";
            List <int> nodeCounts = new List <int>();

            for (int i = 0; i < 11; i++)
            {
                var fileName = folder + i + ".gv";
                var tree     = BRTAnalysis.ReadTree(fileName);
                nodeCounts.Add(tree.BFS(tree.Root).Count());
            }

            // -- experiment --
            var copyFactors      = new[] { 2, 1 };
            var focusCounts      = DataProcess.Utils.Util.GetIntArray(1, 5);
            var focusSampleCount = 5;
            var minMaxTreeCount  = 6;
            var maxMaxTreeCount  = 8;
            int index            = 0;

            ProgramProgress progress  = new ProgramProgress(copyFactors.Length * focusCounts.Length * focusSampleCount * (maxMaxTreeCount - minMaxTreeCount + 1));
            var             configure = new TopicStreamConfigure();

            configure.DataType = "kdd";
            foreach (var copyFactor in copyFactors)
            {
                configure.CopyFactor = copyFactor;
                foreach (var focusCount in focusCounts)
                {
                    for (int iFocusSample = 0; iFocusSample < focusSampleCount; iFocusSample++)
                    {
                        configure.FocusCount               = focusCount;
                        configure.DefaultTreeCut           = GetRandomManuallyTreeCut(focusCount, minMaxTreeCount, iFocusSample, nodeCounts, 1);
                        configure.DefaultTreeCutRandomSeed = iFocusSample;
                        for (int iMaxTreeCount = minMaxTreeCount; iMaxTreeCount <= maxMaxTreeCount; iMaxTreeCount++)
                        {
                            configure.TreeCount = iMaxTreeCount;
                            configure.Index     = index;
                            configure.Write();

                            File.Copy(TopicStreamConfigure.ConfigureFileName, exeFolder + TopicStreamConfigure.ConfigureFileName, true);

                            ProcessStartInfo startInfo = new ProcessStartInfo();
                            startInfo.CreateNoWindow  = true;
                            startInfo.UseShellExecute = false;
                            startInfo.FileName        = exeFolder + @"RoseRiver.exe";
                            startInfo.WindowStyle     = ProcessWindowStyle.Hidden;

                            using (Process exeProcess = Process.Start(startInfo))
                            {
                                exeProcess.WaitForExit();
                            }

                            progress.PrintIncrementExperiment("\n");
                            index++;
                        }
                    }
                }
            }

            progress.PrintTotalTime();
        }
Example #11
        public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
        {
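            // Word-distribution statistics over an index: per-document word counts,
            // per-word document frequencies, and per-word occurrence counts, printed
            // as summary statistics and written out as histograms.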
            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var docNum      = indexReader.NumDocs();

            int[] docWordCnt     = new int[docNum];
            int[] docUniqWordCnt = new int[docNum];
            Dictionary <string, int> wordDocCntDict = new Dictionary <string, int>();
            Dictionary <string, int> wordOccCntDict = new Dictionary <string, int>();

            var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
                ? BingNewsFields.FeatureVectorFieldWeights
                : BingNewsFields.NewsFieldWeights;

            ProgramProgress progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var document = indexReader.Document(iDoc);
                var content  = LuceneOperations.GetContent(document, fieldWeights);

                var words       = NLPOperations.Tokenize(content, tokenConfig);
                var uniqueWords = new HashSet <string>(words);
                docWordCnt[iDoc]     = words.Count;
                docUniqWordCnt[iDoc] = uniqueWords.Count;

                foreach (var word in uniqueWords)
                {
                    if (!wordDocCntDict.ContainsKey(word))
                    {
                        wordDocCntDict.Add(word, 0);
                    }
                    wordDocCntDict[word]++;
                }

                foreach (var word in words)
                {
                    if (!wordOccCntDict.ContainsKey(word))
                    {
                        wordOccCntDict.Add(word, 0);
                    }
                    wordOccCntDict[word]++;
                }

                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            indexReader.Close();

            //Statistics
            DoubleStatistics statDocWordCnt     = new DoubleStatistics();
            DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
            DoubleStatistics statWordDocCnt     = new DoubleStatistics();
            DoubleStatistics statWordOccCnt     = new DoubleStatistics();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                statDocWordCnt.AddNumber(docWordCnt[iDoc]);
                statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
            }

            foreach (var kvp in wordDocCntDict)
            {
                statWordDocCnt.AddNumber(kvp.Value);
            }

            foreach (var kvp in wordOccCntDict)
            {
                statWordOccCnt.AddNumber(kvp.Value);
            }


            Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
            Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
            Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
            Console.WriteLine(statWordOccCnt.ToString("statWordOccCnt"));

            //Hist
            var docWordCntHist       = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1);
            var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1);
            var wordDocCntHist       = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
            var wordDocCntHist2      = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1);

            docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
            docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
            wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
            wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

            Console.Read();
        }
Example #12
        public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords, int printDocumentCnt = 10, string histogramField = null)
        {
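            // Search the index for the keywords, run sentiment analysis on every
            // hit, then report per-class counts, the strongest positive and negative
            // documents, and (optionally) a histogram over an extra field.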
            var searcher = LuceneOperations.GetIndexSearcher(indexPath);
            var reader   = searcher.GetIndexReader();
            var docIDs   = LuceneOperations.Search(searcher, StringOperations.GetMergedString(keywords, " "), field);

            Console.WriteLine("Find {0}% ({1}/{2}) documents containing: {3}", (100.0 * docIDs.Count / reader.NumDocs()), docIDs.Count, reader.NumDocs(), StringOperations.GetMergedString(keywords, " "));

            var              progress      = new ProgramProgress(docIDs.Count);
            var              sentiAnalyzer = new SentimentAnalyzer();
            SentimentType    sentimentType;
            double           sentimentScore;
            HeapSortDouble   hsdPos     = new HeapSortDouble(printDocumentCnt);
            HeapSortDouble   hsdNeg     = new HeapSortDouble(printDocumentCnt);
            Counter <string> counterPos = null;
            Counter <string> counterNeg = null;
            Counter <string> counterNeu = null;

            if (histogramField != null)
            {
                counterPos = new Counter <string>();
                counterNeg = new Counter <string>();
                counterNeu = new Counter <string>();
            }
            int posCnt = 0;
            int negCnt = 0;
            int neuCnt = 0;

            foreach (var docID in docIDs)
            {
                var document = reader.Document(docID);
                var content  = document.Get(field);
                sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);

                switch (sentimentType)
                {
                case SentimentType.Positive:
                    posCnt++;
                    hsdPos.Insert(docID, Math.Abs(sentimentScore));
                    if (histogramField != null)
                    {
                        counterPos.Add(document.Get(histogramField));
                    }
                    break;

                case SentimentType.Negative:
                    negCnt++;
                    hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                    if (histogramField != null)
                    {
                        counterNeg.Add(document.Get(histogramField));
                    }
                    break;

                case SentimentType.Neutral:
                    neuCnt++;
                    if (histogramField != null)
                    {
                        counterNeu.Add(document.Get(histogramField));
                    }
                    break;

                default:
                    throw new NotImplementedException();
                }

                progress.PrintIncrementExperiment();
            }

            Console.WriteLine("Positive document ratio {0}% ({1}/{2})", Math.Round(100.0 * posCnt / docIDs.Count), posCnt, docIDs.Count);
            Console.WriteLine("Negatvie document ratio {0}% ({1}/{2})", Math.Round(100.0 * negCnt / docIDs.Count), negCnt, docIDs.Count);
            Console.WriteLine("Neutral document ratio {0}% ({1}/{2})", Math.Round(100.0 * neuCnt / docIDs.Count), neuCnt, docIDs.Count);

            Console.WriteLine(StringOperations.WrapWithDash("Positive documents"));
            foreach (var kvp in hsdPos.GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }

            Console.WriteLine(StringOperations.WrapWithDash("Negative documents"));
            foreach (var kvp in hsdNeg.GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }

            progress.PrintTotalTime();

            if (histogramField != null)
            {
                string[]           featureStrings = new[] { "Pos", "Neg", "Neu" };
                Counter <string>[] counters       = new[] { counterPos, counterNeg, counterNeu };
                for (int i = 0; i < featureStrings.Length; i++)
                {
                    Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
                    int index = 0;
                    foreach (var kvp in counters[i].GetCountDictionary().OrderByDescending(kvp => kvp.Value))
                    {
                        Console.WriteLine(kvp.Key + "\t" + kvp.Value);
                        if (++index >= 100)
                        {
                            break;
                        }
                    }
                }
            }

            Console.ReadKey();
        }
Example #13
        public void Transform(string inputFolder, string indexPath, HashSet <string> keywords)
        {
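            // Build a deduplicated tweet index from tab-separated text files,
            // keeping only tweets whose tokens contain one of the search keywords;
            // duplicates are detected by the (CreatedAt, UserId, Text) triple.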
            Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
            Console.WriteLine("InputFolder: " + inputFolder + "\n");

            string notParseSpecString = "Temp-DoNotParse";

            inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

            string[] schema = new[]
            {
                "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
                "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
            };
            var schemeDict            = Util.GetInvertedDictionary(schema);
            var textFieldIndex        = schemeDict["Text"];
            var createdTimeFieldIndex = schemeDict["CreatedAt"];
            var userIdFieldIndex      = schemeDict["UserId"];

            //string outputPath = inputFolder + notParseSpecString + "\\";
            //if (Directory.Exists(outputPath))
            //{
            //    Directory.Delete(outputPath, true);
            //}
            //Directory.CreateDirectory(outputPath);
            //var indexPath = outputPath + "Index\\";
            if (Directory.Exists(indexPath))
            {
                Directory.Delete(indexPath, true);
            }

            var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

            //Preprocess
            Console.WriteLine("Start preprocesing...");
            ProgramProgress progress   = new ProgramProgress(files.Length);
            int             estiDocCnt = 0;

            foreach (var file in files)
            {
                estiDocCnt += FileOperations.GetLineCount(file);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();
            Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

            //Parse
            Console.WriteLine("Start parsing...");

            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

            progress = new ProgramProgress(estiDocCnt);
            var sep             = new char[] { '\t' };
            int uniqDocFoundCnt = 0;
            int docFoundCnt     = 0;
            int docCnt          = 0;

            ThreeLayerHashSet <string, long, string> hash3Layer = new ThreeLayerHashSet <string, long, string>();
            int notUsedDocCnt = 0;

            foreach (var file in files)
            {
                if (file.Contains(notParseSpecString))
                {
                    continue;
                }

                if (file.EndsWith(".txt"))
                {
                    var    sr = new StreamReader(file);
                    string line;

                    while ((line = sr.ReadLine()) != null)
                    {
                        var tokens = line.Split(sep, StringSplitOptions.None);
                        if (tokens.Length != schema.Length)
                        {
                            notUsedDocCnt++;
                            continue;
                            //throw new ArgumentException();
                        }

                        var  words           = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                        bool isContainSearch = false;
                        foreach (var word in words)
                        {
                            if (keywords.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        if (isContainSearch)
                        {
                            string createdAt = tokens[createdTimeFieldIndex];
                            long   userId    = long.Parse(tokens[userIdFieldIndex]);
                            string text      = tokens[textFieldIndex];

                            if (!hash3Layer.Contains(createdAt, userId, text))
                            {
                                var document = new Document();
                                for (int i = 0; i < schema.Length; i++)
                                {
                                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                                }
                                indexWriter.AddDocument(document);

                                hash3Layer.Add(createdAt, userId, text);

                                uniqDocFoundCnt++;
                            }
                            docFoundCnt++;
                        }
                        docCnt++;
                        progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
                                                                        uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
                    }

                    sr.Close();
                }
            }
            progress.PrintTotalTime();

            Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
                                            uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, 100 * uniqDocFoundCnt / docFoundCnt));
            Console.WriteLine("Not used doc count: " + notUsedDocCnt);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Console.WriteLine("Finish");
            Console.ReadKey();
        }
Example #14
        public void Start()
        {
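            // Copy documents from the input index into the output writer(s),
            // skipping documents that fail the sampling, time-range, or exact-match
            // filters from the configuration.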
            Initialize();

            var reader = LuceneOperations.GetIndexReader(Configure.InputPath);

            InitializeWriters();

            var docNum   = reader.NumDocs();
            var progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var  doc    = reader.Document(iDoc);
                bool isSkip = false;

                //random sample
                if (!isSkip && Configure.IsSampling)
                {
                    if (Random.NextDouble() > Configure.SampleRatio)
                    {
                        isSkip = true;
                    }
                }

                //filter by time
                if (!isSkip && Configure.IsSelectByTime)
                {
                    var dateTime = StringOperations.ParseDateTimeString(
                        doc.Get(Configure.TimeField), Configure.ParseTimeFormat);
                    if (dateTime.Subtract(StartDateTime).Ticks < 0 ||
                        dateTime.Subtract(EndDateTime).Ticks > 0)
                    {
                        isSkip = true;
                    }
                }

                //filter by exact match
                if (!isSkip && Configure.IsSelectByExactMatch)
                {
                    foreach (var kvp in Configure.FieldMatchDict)
                    {
                        if (doc.Get(kvp.Key) != kvp.Value)
                        {
                            isSkip = true;
                            break;
                        }
                    }
                }

                if (!isSkip)
                {
                    GetWriter(doc).AddDocument(doc);
                }

                progress.PrintIncrementExperiment();
            }

            CloseWriters();

            reader.Close();
        }
Example #15
        public void TransformWithFileNames(string[] files, string indexPath, HashSet <string> searchHashSet, SearchSpinn3rType searchType)
        {
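            // Like the content search above, but a tweet matches either by its
            // tokenized text or by its author link, depending on searchType.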
            double         tweetCnt       = 0;
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        bool isContainSearch = false;
                        switch (searchType)
                        {
                        case SearchSpinn3rType.Main:
                            var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                            foreach (var word in words)
                            {
                                if (searchHashSet.Contains(word))
                                {
                                    isContainSearch = true;
                                    break;
                                }
                            }
                            break;

                        case SearchSpinn3rType.User:
                            isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                            break;

                        default:
                            throw new ArgumentException();
                        }

                        if (isContainSearch)
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Util.ProgramFinishHalt();
        }
Example #16
        private List <int> RemoveSimilarDocumentsGranu(IndexReader reader, List <int> orgDocIDs,
                                                       int timeWindowSize, int wordWindowSize)
        {
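            // Near-duplicate removal: a document is dropped when a document in a
            // nearby time slice shares one of its most frequent words and their
            // feature-vector cosine reaches MinDistinctiveDocumentCosine.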
            var newDocIDs        = new List <int>();
            var removeSimilarity = Configure.MinDistinctiveDocumentCosine;

            Dictionary <int, Dictionary <int, List <SparseVectorList> > > uniqueDocHash = new Dictionary <int, Dictionary <int, List <SparseVectorList> > >();
            int docNum = orgDocIDs.Count;

            int removeDocNum = 0;
            Dictionary <string, int> lexicon = new Dictionary <string, int>();

            int timeslicesize = 1;

            if (timeWindowSize >= 15)
            {
                int[] dividePieceNumbers = new int[] { 3, 4, 5, 7 };
                foreach (int dividePieceNumber in dividePieceNumbers)
                {
                    if (timeWindowSize % dividePieceNumber == 0)
                    {
                        timeslicesize = timeWindowSize / dividePieceNumber;
                        break;
                    }
                }
                if (timeslicesize == 1)
                {
                    timeslicesize  = (timeWindowSize + 2) / 3;
                    timeWindowSize = 3;
                }
                else
                {
                    timeWindowSize /= timeslicesize;
                }
                Console.WriteLine("Reset window size! TimeSliceSize: {0}, WindowSize: {1}", timeslicesize, timeWindowSize);
            }
            int begintimedelta = -(timeWindowSize - 1) / 2;
            int endtimedelta   = timeWindowSize / 2;
            var progress       = new ProgramProgress(docNum);

            StreamWriter debugSw = null;

            if (Configure.IsDebug)
            {
                string fileName = Configure.OutputPath + "debug.txt";
                FileOperations.EnsureFileFolderExist(fileName);
                debugSw = new StreamWriter(fileName, true, Encoding.UTF8);
            }

            foreach (var iDoc in orgDocIDs)
            {
                var doc = reader.Document(iDoc);
                SparseVectorList vector = GetFeatureVector(doc, lexicon);
                if (vector == null)
                {
                    removeDocNum++;
                    continue;
                }

                vector.documentid = iDoc;
                int   time    = getDateTimeBingNews(doc) / timeslicesize;
                int[] words   = getMostFreqWordIndex(vector, wordWindowSize);
                bool  bunqiue = true;
                for (int stime = time + begintimedelta; stime <= time + endtimedelta; stime++)
                {
                    if (uniqueDocHash.ContainsKey(stime))
                    {
                        Dictionary <int, List <SparseVectorList> > wordHash = uniqueDocHash[stime];
                        foreach (int sword in words)
                        {
                            if (wordHash.ContainsKey(sword))
                            {
                                List <SparseVectorList> vectorList = wordHash[sword];
                                foreach (SparseVectorList svector in vectorList)
                                {
                                    if (SparseVectorList.Cosine(svector, vector) >= removeSimilarity)
                                    {
                                        if (Configure.IsDebug && removeDocNum <= 10000)
                                        {
                                            double simi = SparseVectorList.Cosine(svector, vector);
                                            if (simi <= Configure.MaxShowDebugCosine)
                                            {
                                                debugSw.WriteLine("---------------------------------------------------");
                                                debugSw.WriteLine(reader.Document(svector.documentid).Get(BingNewsFields.NewsArticleHeadline)); //Get("NewsArticleDescription"));
                                                debugSw.WriteLine(reader.Document(vector.documentid).Get(BingNewsFields.NewsArticleHeadline));  //Get("NewsArticleDescription"));
                                                debugSw.WriteLine("");
                                                string body1 = reader.Document(svector.documentid).Get(BingNewsFields.NewsArticleDescription);
                                                string body2 = reader.Document(vector.documentid).Get(BingNewsFields.NewsArticleDescription);
                                                if (body1.Length > 100)
                                                {
                                                    body1 = body1.Substring(0, 100);
                                                }
                                                if (body2.Length > 100)
                                                {
                                                    body2 = body2.Substring(0, 100);
                                                }
                                                debugSw.WriteLine(body1);
                                                debugSw.WriteLine(body2);
                                                debugSw.WriteLine(simi);
                                            }
                                            debugSw.Flush();
                                        }
                                        bunqiue = false;
                                        break;
                                    }
                                }
                            }
                            if (!bunqiue)
                            {
                                break;
                            }
                        }
                    }
                    if (!bunqiue)
                    {
                        break;
                    }
                }

                if (bunqiue)
                {
                    int keytime = time;
                    int keyword = words[0];
                    if (!uniqueDocHash.ContainsKey(keytime))
                    {
                        uniqueDocHash.Add(keytime, new Dictionary <int, List <SparseVectorList> >());
                    }
                    Dictionary <int, List <SparseVectorList> > wordHash = uniqueDocHash[keytime];
                    if (!wordHash.ContainsKey(keyword))
                    {
                        wordHash.Add(keyword, new List <SparseVectorList>());
                    }
                    List <SparseVectorList> list = wordHash[keyword];
                    list.Add(vector);

                    newDocIDs.Add(iDoc);
                }
                else
                {
                    removeDocNum++;
                }

                progress.PrintIncrementExperiment();
            }

            Console.WriteLine("Finished remove similar documents. Removed {0} out of {1}", removeDocNum, docNum);

            int listLengthSum = 0, listCnt = 0;

            foreach (Dictionary <int, List <SparseVectorList> > hash0 in uniqueDocHash.Values)
            {
                foreach (List <SparseVectorList> list in hash0.Values)
                {
                    listLengthSum += list.Count;
                    listCnt++;
                }
            }
            Console.WriteLine("AvgListLength: {0}, ListCnt: {1}", listLengthSum / listCnt, listCnt);

            if (Configure.IsDebug)
            {
                debugSw.Flush();
                debugSw.Close();
            }

            return newDocIDs;
        }
Example #17
        public void Start()
        {
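            // Convert a deduplicated tweet index into the Bing News schema: CreateAt
            // becomes the discovery time, the headline is the first hashtag (or
            // first word), and hashtags and words are merged into a feature vector.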
            string inputPath  = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\";
            string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\";

            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

            char[]   separator = new char[] { ' ' };
            string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName",
                                                "Retweet", "Mention" };
            ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());

            //for (int iDoc = 0; iDoc < 1000; iDoc++)
            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc  = indexReader.Document(iDoc);
                Document outDoc = new Document();

                string   inTime   = inDoc.Get("CreateAt");
                DateTime dateTime = DateTime.Parse(inTime);
                outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED));

                string hashtag = inDoc.Get("Hashtag");
                string word    = inDoc.Get("Word");
                if (hashtag == null)
                {
                    hashtag = "";
                }
                var hashtagTokens = hashtag.Split(seperator, StringSplitOptions.RemoveEmptyEntries);
                var wordTokens    = word.Split(seperator, StringSplitOptions.RemoveEmptyEntries);

                string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : "";
                outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED));

                outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED));

                string           featureVector = "";
                Counter <string> counter       = new Counter <string>();
                foreach (var tag in hashtagTokens)
                {
                    counter.Add(tag);
                    counter.Add(tag);
                }
                foreach (var w in wordTokens)
                {
                    counter.Add(w);
                }
                foreach (var kvp in counter.GetSortedCountDictioanry())
                {
                    featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value);
                }
                outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED));

                outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                foreach (var aidField in aidFields)
                {
                    var value = inDoc.Get(aidField);
                    outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED));
                }

                indexWriter.AddDocument(outDoc);

                progress.PrintIncrementExperiment();
            }

            indexWriter.Optimize();
            indexWriter.Close();

            indexReader.Close();
        }
Example #18
        //public void StartEbola(int[] focusSeeds)
        public void StartEbola()
        {
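            // Runtime experiment on the Ebola data: sweep focus seed, copy factor,
            // focus count, and tree count; combinations that already produced a
            // result file are skipped, the rest launch RoseRiver.exe.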
            // -- node counts --
            string folder    = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\Trees3\";
            string exeFolder = @"D:\Project\StreamingRoseRiver\EbolaCaseStudyFinal\RoseRiver\RoseRiver\bin\x64\Release\";

            if (!Directory.Exists(folder))
            {
                folder    = @"H:\Xiting\StreamingRoseRiver\ScalabilityExperiment\Data\Trees3\";
                exeFolder = @"H:\Xiting\StreamingRoseRiver\ScalabilityExperiment\RoseRiverExe\";
            }
            if (!Directory.Exists(folder))
            {
                folder    = @"D:\Documents\roseriver\RoseRiver\RoseRiver\Data\Ebola\Trees3\";
                exeFolder = @"D:\Documents\roseriver\RoseRiver\RoseRiver\Data\Ebola\ScalabilityExperiment\RoseRiver\RoseRiver\bin\x64\Release\";
            }

            List <int> nodeCounts = new List <int>();

            for (int i = 0; i < 30; i++)
            {
                var fileName = folder + i + ".gv";
                var tree     = BRTAnalysis.ReadTree(fileName);
                nodeCounts.Add(tree.BFS(tree.Root).Count());
            }

            // -- experiment --
            var copyFactors = new[] { 1 };              //Util.GetIntArray(1, 9, 2); //new[] {1, 2, 5, 10, 20, 50};
            var focusCounts = new[] { 1, 3, 5 };        //DataProcess.Utils.Util.GetIntArray(1, 5);
            //var focusSampleCount = 1;//50;
            var focusSeeds = Util.GetIntArray(51, 100); //Util.GetIntArray(1, 50); //new[] { 1 };//Util.GetIntArray(1, 50);
            //var minMaxTreeCount = 10;
            //var maxMaxTreeCount = 30;
            var treeCounts = Util.GetIntArray(5, 30); //new int[] { 5, 10 };//new[] {10, 20};
            int index      = 0;

            ProgramProgress progress =
                new ProgramProgress(copyFactors.Length * focusCounts.Length * focusSeeds.Length * treeCounts.Length);
            var configure = new TopicStreamConfigure();

            foreach (int focusSeed in focusSeeds)
            {
                foreach (var copyFactor in copyFactors)
                {
                    configure.CopyFactor = copyFactor;
                    foreach (var focusCount in focusCounts)
                    {
                        configure.FocusCount     = focusCount;
                        configure.DefaultTreeCut = GetRandomManuallyTreeCut(focusCount, treeCounts.Min(), focusSeed,
                                                                            nodeCounts, 1);
                        configure.DefaultTreeCutRandomSeed = focusSeed;
                        foreach (var treeCount in treeCounts)
                        {
                            if (File.Exists("RunTimeExperiment\\" + index + ".txt"))
                            {
                                Console.WriteLine("Skip index = " + index);
                                index++;
                                progress.PrintSkipExperiment();
                                continue;
                            }

                            configure.TreeCount = treeCount;
                            configure.Index     = index;
                            configure.Write();

                            File.Copy(TopicStreamConfigure.ConfigureFileName,
                                      exeFolder + TopicStreamConfigure.ConfigureFileName, true);

                            ProcessStartInfo startInfo = new ProcessStartInfo();
                            startInfo.ErrorDialog     = false;
                            startInfo.CreateNoWindow  = false;
                            startInfo.UseShellExecute = false;
                            startInfo.FileName        = exeFolder + @"RoseRiver.exe";
                            startInfo.WindowStyle     = ProcessWindowStyle.Normal;

                            using (Process exeProcess = Process.Start(startInfo))
                            {
                                exeProcess.WaitForExit();
                            }

                            progress.PrintIncrementExperiment("\n");
                            index++;
                        }
                    }
                }
            }

            progress.PrintTotalTime();
        }