Example #1
 private void InitializeWriters()
 {
     if (Configure.IsSplitByTime)
     {
         _dateTransferFunc = str =>
         {
             var dateTime = StringOperations.ParseDateTimeString(str, _dateFormatString);
             if (Configure.SplitDayCount == 7)
             {
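                 // SplitDayCount == 7: snap to the start of the week.
                 // (int)dateTime.DayOfWeek is 0 for Sunday, so this subtracts back to the preceding Sunday.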
                 dateTime = dateTime.Subtract(TimeSpan.FromDays((int)dateTime.DayOfWeek));
             }
             else
             {
                 var days        = dateTime.Subtract(_minDateTime).TotalDays;
                 var residueDays = days % Configure.SplitDayCount;
                 dateTime = dateTime.Subtract(TimeSpan.FromDays(residueDays));
             }
             return dateTime.ToString("yyyy-MM-dd");
         };
     }
     else
     {
         IndexWriter writer = LuceneOperations.GetIndexWriter(Configure.OutputPath);
         _writers.Add("", writer);
     }
 }
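
A quick aside: a minimal standalone sketch of the weekly branch above (DateTime.Parse stands in for the project's StringOperations.ParseDateTimeString):

 var dateTime = DateTime.Parse("2015-01-07");                              // a Wednesday
 dateTime = dateTime.Subtract(TimeSpan.FromDays((int)dateTime.DayOfWeek)); // DayOfWeek.Sunday == 0
 Console.WriteLine(dateTime.ToString("yyyy-MM-dd"));                       // 2015-01-04, the preceding Sunday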
Example #2
        public void StartTransformTweetIndexForStreamingRoseRiver()
        {
            string inputPath  = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
            string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";

            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

            string          docIDField = BingNewsFields.DocId;
            string          urlField   = BingNewsFields.DocumentURL;
            ProgramProgress progress   = new ProgramProgress(indexReader.NumDocs());

            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc  = indexReader.Document(iDoc);
                Document outDoc = inDoc;
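                // Note: outDoc aliases inDoc (same Document instance), so the edits below mutate it in place.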

                outDoc.RemoveField(docIDField);
                outDoc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                outDoc.RemoveField(urlField);
                outDoc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                indexWriter.AddDocument(outDoc);
                progress.PrintIncrementExperiment();
            }

            indexWriter.Optimize();
            indexWriter.Close();

            indexReader.Close();
        }
Example #3
        public void Start()
        {
            var         reader = LuceneOperations.GetIndexReader(Configure.InputPath);
            var         sw     = new StreamWriter(Configure.OutputPath);
            IndexWriter writer = null;

            if (Configure.IsFilterByWordCount)
            {
                writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
            }
            if (Configure.IsLoadFromFeatureVector)
            {
                Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
            }

            Console.WriteLine("Total: " + reader.NumDocs());
            int docIndex = 0;

            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                if (iDoc % 10000 == 0)
                {
                    Console.WriteLine(iDoc);
                    sw.Flush();
                }

                string content = Configure.IsLoadFromFeatureVector ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector) :
                                 LuceneOperations.GetDocumentContent(reader.Document(iDoc), Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);

                List <string> words      = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                bool          isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
                if (isPrintDoc)
                {
                    if (Configure.IsFilterByWordCount)
                    {
                        writer.AddDocument(reader.Document(iDoc));
                    }

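                    // Each output line: the document index written twice, then the tokens (presumably the format the downstream consumer expects).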
                    sw.Write(docIndex + " " + docIndex + " ");

                    foreach (var word in words)
                    {
                        sw.Write(word + " ");
                    }
                    sw.Write("\n");

                    docIndex++;
                }
            }

            if (Configure.IsFilterByWordCount)
            {
                writer.Optimize();
                writer.Close();
            }

            sw.Flush();
            sw.Close();
            reader.Close();
        }
Example #4
        public void Start()
        {
            var writer = LuceneOperations.GetIndexWriter(OutputPath);

            var totalDocCnt = 0;

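            // First pass: count documents across all inputs so ProgramProgress knows the total up front.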
            foreach (var inputPath in InputPaths)
            {
                var reader = LuceneOperations.GetIndexReader(inputPath);
                totalDocCnt += reader.NumDocs();
                reader.Close();
            }

            var progress = new ProgramProgress(totalDocCnt);

            foreach (var inputPath in InputPaths)
            {
                var reader = LuceneOperations.GetIndexReader(inputPath);
                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    writer.AddDocument(reader.Document(iDoc));
                    progress.PrintIncrementExperiment();
                }
                reader.Close();
            }

            writer.Optimize();
            writer.Close();
        }
Example #5
        /// <summary>
        /// Twitter data from cosmos: each line represents a tweet.
        /// Fields are separated by '\t'; the schema gives the name of each field.
        /// </summary>
        private void BuildFromTwitterTxt()
        {
            string inputpath  = TwitterConfigure.InputPath;
            string outputpath = TwitterConfigure.OutputPath;
            var    schema     = TwitterConfigure.TwitterSchema;
            string bodyField  = TwitterConfigure.TwitterBodyField;

            var indexwriter = LuceneOperations.GetIndexWriter(outputpath);

            StreamReader sr = new StreamReader(inputpath);
            string       line;
            int          lineCnt = 0;

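            // First pass: count lines so the progress reporter knows the total.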
            while ((line = sr.ReadLine()) != null)
            {
                lineCnt++;
            }
            //Console.WriteLine("Total Lines: " + lineCnt);
            sr.Close();

            sr = new StreamReader(inputpath);
            var seperator = new char[] { '\t' };
            int lineIndex = 0;
            var progress  = new ProgramProgress(lineCnt);

            while ((line = sr.ReadLine()) != null)
            {
                //if (lineIndex % 100000 == 0)
                //    Console.WriteLine("{0} out of {1} ({2}%)", lineIndex, lineCnt, 100 * lineIndex / lineCnt);

                var tokens = line.Split(seperator);//, StringSplitOptions.RemoveEmptyEntries);
                if (tokens.Length != schema.Length)
                {
                    throw new Exception("Unmatched schema");
                }
                var document = new Document();
                for (int i = 0; i < tokens.Length; i++)
                {
                    if (schema[i] == bodyField)
                    {
                        tokens[i] = RemoveContentNoise.RemoveTweetIndexNoise(tokens[i]);
                    }
                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                }
                indexwriter.AddDocument(document);

                lineIndex++;
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            sr.Close();

            indexwriter.Optimize();
            indexwriter.Close();
        }
Example #6
        public void TransformWithFileNameContentSearch(string[] files, string indexPath, string searchStr, string progressEndStr = null)
        {
            double tweetCnt    = 0;
            var    indexWriter = LuceneOperations.GetIndexWriter(indexPath);

            searchStr = searchStr.ToLower();

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

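            // Stream each Spinn3r JSON file; keep only English tweets whose text contains searchStr.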
            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        if (tweet.main.ToLower().Contains(searchStr))
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%) -- {3}", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount, progressEndStr));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            //Util.ProgramFinishHalt();
        }
Example #7
        public void Start()
        {
            string debugFileName = Configure.OutputPath + _debugFileName;

            if (File.Exists(debugFileName))
            {
                File.Delete(debugFileName);
            }

            var        reader = LuceneOperations.GetIndexReader(Configure.InputPath);
            List <int> docIDs = new List <int>();

            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                docIDs.Add(iDoc);
            }

            if (Configure.IsRemoveSameURL)
            {
                Console.WriteLine("=====================RemoveSameURL=====================");
                docIDs = RemoveSameURLDocument(reader, docIDs);
            }

            if (Configure.IsRemoveSimilarContent)
            {
                for (int iGranu = 0; iGranu < Configure.RemoveDateGranularity.Length; iGranu++)
                {
                    int timeGranu = Configure.RemoveDateGranularity[iGranu];
                    int wordGranu = Configure.RemoveWordGranularity[iGranu];

                    Console.WriteLine("========Remove Similar Document: {0} out of {1}, Granu: {2} {3}========",
                                      iGranu, Configure.RemoveDateGranularity.Length, timeGranu, wordGranu);

                    docIDs = RemoveSimilarDocumentsGranu(reader, docIDs, timeGranu, wordGranu);
                }
            }

            var writer = LuceneOperations.GetIndexWriter(Configure.OutputPath);

            foreach (var docID in docIDs)
            {
                writer.AddDocument(reader.Document(docID));
            }

            writer.Optimize();
            writer.Close();
            reader.Close();

            Console.WriteLine("All done");
            //Console.ReadKey();
        }
Example #8
 private IndexWriter GetWriter(Document doc)
 {
     if (!Configure.IsSplitByTime)
     {
         return _writers.Values.First();
     }
     else
     {
         var         dateTime = StringOperations.ParseDateTimeString(doc.Get(Configure.TimeField), Configure.ParseTimeFormat);
         string      projDate = _dateTransferFunc(dateTime.ToString(_dateFormatString));
         IndexWriter writer;
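         // Lazily create one writer per projected date bucket; each bucket writes to its own sub-folder of OutputPath.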
         if (!_writers.TryGetValue(projDate, out writer))
         {
             string path = StringOperations.EnsureFolderEnd(Configure.OutputPath) + projDate;
             writer             = LuceneOperations.GetIndexWriter(path);
             _writers[projDate] = writer;
         }
         return writer;
     }
 }
Example #9
        private void BuildFromWeiboWebPages()
        {
            var indexWriter = LuceneOperations.GetIndexWriter(WeiboConfigure.OutputPath);

            //int totalWeiboCount = 0;
            //int totalFileCount = 0;
            foreach (var filename in Directory.EnumerateFiles(WeiboConfigure.InputPath, "*.txt", SearchOption.AllDirectories))
            {
                if (Path.GetFileName(filename).StartsWith("_"))
                {
                    continue;
                }
                var parser = new WeiboParser(filename);
                foreach (var weibo in parser.GetContainedWeibo())
                {
                    Document doc = new Document();
                    doc.Add(new Field(WeiboLuceneFields.UserNickName, weibo.UserNickName, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.UserID, weibo.UserID, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.NewsArticleDescription, weibo.Content, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.DiscoveryStringTime, weibo.Time, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.Source, weibo.Source, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.UpCount, weibo.UpCount.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.ForwardCount, weibo.ForwardCount.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.CollectCount, weibo.CollectCount.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field(WeiboLuceneFields.ReplyCount, weibo.ReplyCount.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                    indexWriter.AddDocument(doc);
                }
                //Console.WriteLine(filename);
                //var cnt = parser.GetContainedWeibo().Count;
                //Console.WriteLine(cnt);
                //totalWeiboCount += cnt;
                //totalFileCount++;
            }

            //Console.WriteLine("Total count:" + totalWeiboCount);
            //Console.WriteLine("Total file count: " + totalFileCount);

            indexWriter.Optimize();
            indexWriter.Close();
        }
Example #10
        private void BuildFromBingNewsXMLs()
        {
            string[] selectedFields = new string[] {
                "DocumentURL", "DocumentUrl", "Country", "NewsArticleCategoryData",
                "NewsArticleHeadline", "NewsArticleDescription",
                "DiscoveryStringTime", "PublishedDateTime",
                "DownloadStringTime", "PublishedDateTime", "NewsSource"
            };                                                            //NewsArticleBodyNEMap, RealTimeType

            List <string>   bingnewspaths = BingNewsConfigure.BingNewsPaths;
            int             iProcessor    = BingNewsConfigure.iProcessor;
            int             processorNum  = BingNewsConfigure.ProcessorNum;
            string          startdate     = BingNewsConfigure.StartDate;
            string          enddate       = BingNewsConfigure.EndDate;
            List <string[]> keywordLists  = BingNewsConfigure.KeywordLists;
            List <string>   indexpaths    = BingNewsConfigure.IndexPaths;
            List <string>   languages     = BingNewsConfigure.Languages;
            int             maxThreadNum  = BingNewsConfigure.MaxThreadNum;

            //LoadExtractBingNewsDataConfig_KeyWordList(out bingnewspaths,
            //    out iProcessor, out processorNum, out startdate, out enddate,
            //    out keywordLists, out languages, out indexpaths);

            List <string> outputdirs    = new List <string>();
            List <string> infofilenames = new List <string>();
            int           ikeyword2     = 0;

            foreach (string indexpath in indexpaths)
            {
                string outputdir = indexpath + "BingNews_" + keywordLists[ikeyword2][0] + "_" + iProcessor + "_" + processorNum;
                if (!Directory.Exists(outputdir))
                {
                    Directory.CreateDirectory(outputdir);
                }
                infofilenames.Add(indexpath + "BingNews_" + keywordLists[ikeyword2][0] + "_" + iProcessor + "_" + processorNum + ".dat");
                outputdirs.Add(outputdir);
                ikeyword2++;
            }

            List <IndexWriter>  indexwriters = new List <IndexWriter>();
            List <StreamWriter> infofiles    = new List <StreamWriter>();

            for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
            {
                IndexWriter  indexwriter = LuceneOperations.GetIndexWriter(outputdirs[ikeyword2]);
                StreamWriter infofile    = new StreamWriter(infofilenames[ikeyword2]);
                indexwriters.Add(indexwriter);
                infofiles.Add(infofile);
            }

            List <string> allfilenames = new List <string>();

            foreach (var bingnewpath in bingnewspaths)
            {
                allfilenames.AddRange(Directory.GetFiles(bingnewpath, "*.*", System.IO.SearchOption.AllDirectories));
            }
            allfilenames = FilterDates(allfilenames, startdate, enddate).ToList();
            List <string> filenames = new List <string>();

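            // Round-robin partition: this process (iProcessor of processorNum) takes every processorNum-th file.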
            for (int i = iProcessor; i < allfilenames.Count; i += processorNum)
            {
                filenames.Add(allfilenames[i]);
            }

            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            ProgramProgress progress = new ProgramProgress(filenames.Count);

            //ProgramProgress progress = new ProgramProgress(filenames.Count / processorNum);

            int[] newsfoundcnts = new int[keywordLists.Count];

            DateTime time_begin_1 = DateTime.Now;

            //for (int ifilename = iProcessor; ifilename < filenames.Count; ifilename += processorNum)

            if (maxThreadNum == 1)
            {
                foreach (var filename in filenames)
                {
                    BuildLuceneFromFile(filename, keywordLists, indexwriters, languages, selectedFields, newsfoundcnts, infofiles, progress);
                }
            }
            else
            {
                ParallelOptions options = new ParallelOptions();
                options.MaxDegreeOfParallelism = maxThreadNum;

                Parallel.ForEach(filenames, options, filename => BuildLuceneFromFile(filename, keywordLists, indexwriters, languages, selectedFields, newsfoundcnts, infofiles, progress));
            }

            for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
            {
                infofiles[ikeyword2].WriteLine("Extract xml time\t" + stopwatch.Elapsed);
            }

            Console.WriteLine("Start writing to lucene index...");

            Stopwatch stopwatch2 = new Stopwatch();

            stopwatch2.Start();

            for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
            {
                indexwriters[ikeyword2].Optimize();
                indexwriters[ikeyword2].Close();
            }

            for (ikeyword2 = 0; ikeyword2 < keywordLists.Count; ikeyword2++)
            {
                infofiles[ikeyword2].WriteLine("Write to lucene index time\t" + stopwatch2.Elapsed);
                infofiles[ikeyword2].WriteLine("Total time\t" + stopwatch.Elapsed);
                infofiles[ikeyword2].Flush();
                infofiles[ikeyword2].Close();
            }
        }
Example #11
        public void Transform(string inputFolder, string indexPath, HashSet <string> keywords)
        {
            Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
            Console.WriteLine("InputFolder: " + inputFolder + "\n");

            string notParseSpecString = "Temp-DoNotParse";

            inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

            string[] schema = new[]
            {
                "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
                "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
            };
            var schemeDict            = Util.GetInvertedDictionary(schema);
            var textFieldIndex        = schemeDict["Text"];
            var createdTimeFieldIndex = schemeDict["CreatedAt"];
            var userIdFieldIndex      = schemeDict["UserId"];

            //string outputPath = inputFolder + notParseSpecString + "\\";
            //if (Directory.Exists(outputPath))
            //{
            //    Directory.Delete(outputPath, true);
            //}
            //Directory.CreateDirectory(outputPath);
            //var indexPath = outputPath + "Index\\";
            if (Directory.Exists(indexPath))
            {
                Directory.Delete(indexPath, true);
            }

            var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

            //Preprocess
            Console.WriteLine("Start preprocesing...");
            ProgramProgress progress   = new ProgramProgress(files.Length);
            int             estiDocCnt = 0;

            foreach (var file in files)
            {
                estiDocCnt += FileOperations.GetLineCount(file);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();
            Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

            //Parse
            Console.WriteLine("Start parsing...");

            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

            progress = new ProgramProgress(estiDocCnt);
            var sep             = new char[] { '\t' };
            int uniqDocFoundCnt = 0;
            int docFoundCnt     = 0;
            int docCnt          = 0;

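            // Deduplicate tweets on the (CreatedAt, UserId, Text) triple.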
            ThreeLayerHashSet <string, long, string> hash3Layer = new ThreeLayerHashSet <string, long, string>();
            int notUsedDocCnt = 0;

            foreach (var file in files)
            {
                if (file.Contains(notParseSpecString))
                {
                    continue;
                }

                if (file.EndsWith(".txt"))
                {
                    var    sr = new StreamReader(file);
                    string line;

                    while ((line = sr.ReadLine()) != null)
                    {
                        var tokens = line.Split(sep, StringSplitOptions.None);
                        if (tokens.Length != schema.Length)
                        {
                            notUsedDocCnt++;
                            continue;
                            //throw new ArgumentException();
                        }

                        var  words           = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                        bool isContainSearch = false;
                        foreach (var word in words)
                        {
                            if (keywords.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        if (isContainSearch)
                        {
                            string createdAt = tokens[createdTimeFieldIndex];
                            long   userId    = long.Parse(tokens[userIdFieldIndex]);
                            string text      = tokens[textFieldIndex];

                            if (!hash3Layer.Contains(createdAt, userId, text))
                            {
                                var document = new Document();
                                for (int i = 0; i < schema.Length; i++)
                                {
                                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                                }
                                indexWriter.AddDocument(document);

                                hash3Layer.Add(createdAt, userId, text);

                                uniqDocFoundCnt++;
                            }
                            docFoundCnt++;
                        }
                        docCnt++;
                        progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
                                                                        uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
                    }

                    sr.Close();
                }
            }
            progress.PrintTotalTime();

            Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
                                            uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, 100 * uniqDocFoundCnt / docFoundCnt));
            Console.WriteLine("Not used doc count: " + notUsedDocCnt);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Console.WriteLine("Finish");
            Console.ReadKey();
        }
Example #12
        public void Start()
        {
            if (!outputpath.EndsWith("\\"))
            {
                outputpath += "\\";
            }

            var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);

            var searcher    = LuceneOperations.GetIndexSearcher(inputpath);
            var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
            var scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

            int  iter      = 0;
            bool bContinue = threshold != 0;

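            // Pseudo-relevance-feedback-style loop: re-query with the most frequent words of the current
            // result set until successive result sets overlap by at least threshold, or five iterations pass.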
            while (bContinue && iter < 5)
            {
                iter++;
                Console.WriteLine("iteration------------------" + iter);
                List <string> keywordsNew;
                #region Calculate Keywords
                var counter = new Counter <string>();
                foreach (var scoredDoc in scoredDocs)
                {
                    var doc     = searcher.Doc(scoredDoc.doc);
                    var content = doc.Get(searchfield);
                    foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
                    {
                        counter.Add(word);
                    }
                }
                keywordsNew = counter.GetMostFreqObjs(keywordNum);
                #endregion

                var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);
                #region Test whether exit
                int repeatNum = 0;
                var docIDs    = new HashSet <int>();
                foreach (var scoredDoc in scoredDocs)
                {
                    docIDs.Add(scoredDoc.doc);
                }

                foreach (var scoredDocNew in scoredDocsNew)
                {
                    if (docIDs.Contains(scoredDocNew.doc))
                    {
                        repeatNum++;
                    }
                }

                bContinue = (double)repeatNum / scoredDocs.Length < threshold;
                #endregion

                Console.WriteLine(repeatNum + "  " + scoredDocsNew.Length);

                keywords   = keywordsNew;
                scoredDocs = scoredDocsNew;

                Console.WriteLine(StringOperations.GetMergedString(keywords));
            }

            max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
            scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);
            var writer = LuceneOperations.GetIndexWriter(outputpath);
            foreach (var scoredDoc in scoredDocs)
            {
                Document doc = searcher.Doc(scoredDoc.doc);
                writer.AddDocument(doc);
            }
            writer.Optimize();
            writer.Close();

            if (isPrintRemovedDocuments)
            {
                var sw             = new StreamWriter(outputpath + "removeDocuments.txt");
                var selectedDocIDs = new HashSet <int>();
                foreach (var scoredDoc in scoredDocs)
                {
                    selectedDocIDs.Add(scoredDoc.doc);
                }

                var reader = searcher.GetIndexReader();
                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    if (!selectedDocIDs.Contains(iDoc))
                    {
                        sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
                    }
                }
                reader.Close();
                sw.Flush();
                sw.Close();
            }

            searcher.Close();

            Console.WriteLine("Done");
            Console.ReadKey();
        }
Example #13
        public void TransformWithFileNames(string[] files, string indexPath, HashSet <string> searchHashSet, SearchSpinn3rType searchType)
        {
            double         tweetCnt       = 0;
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        bool isContainSearch = false;
                        switch (searchType)
                        {
                        case SearchSpinn3rType.Main:
                            var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                            foreach (var word in words)
                            {
                                if (searchHashSet.Contains(word))
                                {
                                    isContainSearch = true;
                                    break;
                                }
                            }
                            break;

                        case SearchSpinn3rType.User:
                            isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                            break;

                        default:
                            throw new ArgumentException();
                        }

                        if (isContainSearch)
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Util.ProgramFinishHalt();
        }
Example #14
        public void Start()
        {
            string inputPath  = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\";
            string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\";

            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

            char[]   seperator = new char[] { ' ' };
            string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName",
                                                "Retweet", "Mention" };
            ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());

            //for (int iDoc = 0; iDoc < 1000; iDoc++)
            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc  = indexReader.Document(iDoc);
                Document outDoc = new Document();

                string   inTime   = inDoc.Get("CreateAt");
                DateTime dateTime = DateTime.Parse(inTime);
                outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED));

                string hashtag = inDoc.Get("Hashtag");
                string word    = inDoc.Get("Word");
                if (hashtag == null)
                {
                    hashtag = "";
                }
                var hashtagTokens = hashtag.Split(seperator, StringSplitOptions.RemoveEmptyEntries);
                var wordTokens    = word.Split(seperator, StringSplitOptions.RemoveEmptyEntries);

                string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : "";
                outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED));

                outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED));

                string           featureVector = "";
                Counter <string> counter       = new Counter <string>();
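                // Hashtags are counted twice so they outweigh ordinary words in the feature vector.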
                foreach (var tag in hashtagTokens)
                {
                    counter.Add(tag);
                    counter.Add(tag);
                }
                foreach (var w in wordTokens)
                {
                    counter.Add(w);
                }
                foreach (var kvp in counter.GetSortedCountDictioanry())
                {
                    featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value);
                }
                outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED));

                outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                foreach (var aidField in aidFields)
                {
                    var value = inDoc.Get(aidField);
                    outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED));
                }

                indexWriter.AddDocument(outDoc);

                progress.PrintIncrementExperiment();
            }

            indexWriter.Optimize();
            indexWriter.Close();

            indexReader.Close();
        }