Example #1
        public void Start()
        {
            var         reader = LuceneOperations.GetIndexReader(Configure.InputPath);
            var         sw     = new StreamWriter(Configure.OutputPath);
            IndexWriter writer = null;

            if (Configure.IsFilterByWordCount)
            {
                writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
            }
            if (Configure.IsLoadFromFeatureVector)
            {
                Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
            }

            Console.WriteLine("Total: " + reader.NumDocs());
            int docIndex = 0;

            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                if (iDoc % 10000 == 0)
                {
                    Console.WriteLine(iDoc);
                    sw.Flush();
                }

                string content = Configure.IsLoadFromFeatureVector ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector) :
                                 LuceneOperations.GetDocumentContent(reader.Document(iDoc), Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);

                List <string> words      = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                bool          isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
                if (isPrintDoc)
                {
                    if (Configure.IsFilterByWordCount)
                    {
                        writer.AddDocument(reader.Document(iDoc));
                    }

                    sw.Write(docIndex + " " + docIndex + " ");

                    foreach (var word in words)
                    {
                        sw.Write(word + " ");
                    }
                    sw.Write("\n");

                    docIndex++;
                }
            }

            if (Configure.IsFilterByWordCount)
            {
                writer.Optimize();
                writer.Close();
            }

            sw.Flush();
            sw.Close();
            reader.Close();
        }
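Each output line produced above has the form "<docIndex> <docIndex> <word1> <word2> ...". A minimal read-back sketch, assuming only that format ("corpus.txt" is a hypothetical path):

        // Hedged sketch: parse one line of the corpus file written by Start().
        foreach (var line in File.ReadLines("corpus.txt"))
        {
            var tokens   = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            int docIndex = int.Parse(tokens[0]); // tokens[1] repeats the index
            var words    = tokens.Skip(2);       // the remaining tokens are the words
        }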
Example #2
        /// <summary>
        /// Keep only the tweets whose creation time falls within a given time range
        /// Output: *.filter.txt
        /// </summary>
        /// <param name="lucenePath">Lucene index folder path of tweets</param>
        /// <param name="fileName">Input file path and prefix of output file</param>
        /// <param name="minTimeStr">Lower bound of time range</param>
        /// <param name="maxTimeStr">Upper bound of time range</param>
        public static void filterTimeRange(string lucenePath, string fileName, string minTimeStr, string maxTimeStr)
        {
            var          indexReader = LuceneOperations.GetIndexReader(lucenePath);
            StreamReader sr          = new StreamReader(fileName, Encoding.Default);
            FileStream   fs          = new FileStream(fileName + ".filter.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            // Parse the bounds once; they do not change per document
            DateTime minTime = DateTime.Parse(minTimeStr);
            DateTime maxTime = DateTime.Parse(maxTimeStr);

            string line;

            while ((line = sr.ReadLine()) != null)
            {
                int      iDoc    = int.Parse(line);
                Document inDoc   = indexReader.Document(iDoc);
                string   timeStr = inDoc.Get("CreatedAt");
                DateTime time    = DateTime.Parse(timeStr);
                if (DateTime.Compare(time, minTime) > 0 && DateTime.Compare(time, maxTime) < 0)
                {
                    sw.WriteLine(iDoc);
                }
            }

            sw.Close();
            fs.Close();
            sr.Close();
        }
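A hypothetical invocation; the index path, input file, and bounds are illustrative, and the bounds may be anything DateTime.Parse accepts:

        // Keeps the doc IDs from signal.txt whose CreatedAt falls inside the
        // range and writes them to signal.txt.filter.txt.
        filterTimeRange(@"D:\DataProcess\TweetIndex\Tweets", "signal.txt",
                        "2014-10-01", "2014-11-01");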
Example #3
        public void StartTransformTweetIndexForStreamingRoseRiver()
        {
            string inputPath  = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
            string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";

            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

            string          docIDField = BingNewsFields.DocId;
            string          urlField   = BingNewsFields.DocumentURL;
            ProgramProgress progress   = new ProgramProgress(indexReader.NumDocs());

            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc  = indexReader.Document(iDoc);
                Document outDoc = inDoc;

                outDoc.RemoveField(docIDField);
                outDoc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                outDoc.RemoveField(urlField);
                outDoc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                indexWriter.AddDocument(outDoc); // outDoc aliases inDoc; adding it makes the intent explicit
                progress.PrintIncrementExperiment();
            }


            indexWriter.Optimize();
            indexWriter.Close();

            indexReader.Close();
        }
Example #4
        public void Start()
        {
            var writer = LuceneOperations.GetIndexWriter(OutputPath);

            var totalDocCnt = 0;

            foreach (var inputPath in InputPaths)
            {
                var reader = LuceneOperations.GetIndexReader(inputPath);
                totalDocCnt += reader.NumDocs();
                reader.Close();
            }

            var progress = new ProgramProgress(totalDocCnt);

            foreach (var inputPath in InputPaths)
            {
                var reader = LuceneOperations.GetIndexReader(inputPath);
                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    writer.AddDocument(reader.Document(iDoc));
                    progress.PrintIncrementExperiment();
                }
                reader.Close();
            }

            writer.Optimize();
            writer.Close();
        }
Example #5
        public static void AnalyzeFieldValues(string inputPath, string fieldName, Func <string, string> convertValueFunc = null)
        {
            if (convertValueFunc == null)
            {
                convertValueFunc = str => str;
            }

            string       fileName = StringOperations.EnsureFolderEnd(inputPath) + fieldName + ".txt";
            StreamWriter sw       = new StreamWriter(fileName);

            Counter <string> counter = new Counter <string>();
            var indexReader          = LuceneOperations.GetIndexReader(inputPath);

            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                var doc   = indexReader.Document(iDoc);
                var value = doc.Get(fieldName);
                counter.Add(convertValueFunc(value));
            }
            foreach (var kvp in counter.GetCountDictionary().OrderBy(kvp => kvp.Key))
            {
                sw.WriteLine(kvp.Key + "\t\t" + kvp.Value);
                Console.WriteLine(kvp.Key + "\t\t" + kvp.Value);
            }

            sw.WriteLine("total: " + indexReader.NumDocs());
            sw.Flush();
            sw.Close();

            indexReader.Close();
            Console.ReadKey();
        }
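The optional converter lets one value stand in for many. A hedged example that buckets a raw timestamp field by year (the index path and field name are assumptions):

        // Counts documents per year of their CreatedAt value rather than per raw
        // string; writes the table to CreatedAt.txt inside the index folder.
        AnalyzeFieldValues(@"D:\DataProcess\Index\Tweets", "CreatedAt",
                           str => DateTime.Parse(str).Year.ToString());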
Example #6
        /// <summary>
        /// Calculate the average published time of each tweet cluster
        /// Output: clusterAverageTime.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void averageTime(string fileName)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("signalCluster.txt", Encoding.Default);
            StreamReader sr1         = new StreamReader("generalCluster.txt", Encoding.Default);
            FileStream   fs          = new FileStream("clusterAverageTime.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            string line;
            string line1;

            // Each cluster record spans three lines: the loop condition consumes
            // the first, the reads below pick up the space-separated tweet IDs
            // from the second, and the third is skipped.
            while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null)
            {
                line  = sr.ReadLine();
                line1 = sr1.ReadLine();
                sr.ReadLine();
                sr1.ReadLine();

                string[]   iDocStrArray = Regex.Split(line, " ");
                List <int> iDocList     = new List <int>();
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }

                string[]   iDocStrArray1 = Regex.Split(line1, " ");
                List <int> iDocList1     = new List <int>();
                for (int i = 0; i < iDocStrArray1.Length - 1; i++)
                {
                    iDocList1.Add(int.Parse(iDocStrArray1[i]));
                }

                int    count = iDocList.Count + iDocList1.Count;
                double temp  = 0.0; // accumulates the mean of the publish-time Ticks across both lists
                for (int i = 0; i < iDocList.Count; i++)
                {
                    Document inDoc   = indexReader.Document(iDocList[i]);
                    string   timeStr = inDoc.Get("CreatedAt");
                    DateTime time    = DateTime.Parse(timeStr);
                    temp += (double)time.Ticks / count;
                }
                for (int i = 0; i < iDocList1.Count; i++)
                {
                    Document inDoc   = indexReader.Document(iDocList1[i]);
                    string   timeStr = inDoc.Get("CreatedAt");
                    DateTime time    = DateTime.Parse(timeStr);
                    temp += (double)time.Ticks / count;
                }
                DateTime timeAvg = new DateTime((long)temp);

                sw.WriteLine(timeAvg.ToString());
            }

            sw.Close();
            fs.Close();
            sr1.Close();
            sr.Close();
        }
Example #7
        public static void VisualizeTree(IEnumerable <string> brtFiles, string luceneIndex = null, string[] keywords = null, bool isRemoveLeafNodes = true)
        {
            List <ITree> trees = new List <ITree>();

            foreach (var brtFile in brtFiles)
            {
                //Read tree from file
                TreeDataParser parser = new TreeDataParser(brtFile, isRemoveLeafNodes);
                var            tree   = parser.GetTree();
                Trace.WriteLine(tree.GetDepth(tree.Root));
                if (luceneIndex != null)
                {
                    var scheme = TreeNodeScheme.Get(tree.Graph.NodeTable);
                    scheme.SetIndexReader(LuceneOperations.GetIndexReader(luceneIndex));
                    scheme.SetBRTFileName(brtFile);
                }
                trees.Add(tree);
            }

            //Print analyze info
            DoubleStatistics depthStat        = new DoubleStatistics();
            DoubleStatistics internalNodeStat = new DoubleStatistics();

            foreach (var tree in trees)
            {
                depthStat.AddNumber(tree.BFS(tree.Root).Max(node =>
                {
                    int depth      = 0;
                    INode ancestor = node;
                    while ((ancestor = tree.GetParent(ancestor)) != null)
                    {
                        depth++;
                    }
                    return(depth);
                }) + 1);
                internalNodeStat.AddNumber(tree.BFS(tree.Root).Count());
            }
            Console.WriteLine(depthStat.ToString());
            Console.WriteLine(internalNodeStat.ToString());

            //Visualize tree
            Thread NetServer = new Thread(new ThreadStart(() =>
            {
                TreeVisualization treeVis = new TreeVisualization(trees, keywords);
            }));

            NetServer.SetApartmentState(ApartmentState.STA);
            NetServer.IsBackground = true;
            NetServer.Start();
            System.Windows.Threading.Dispatcher.Run();
        }
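A hypothetical call (all paths and keywords are illustrative):

        // Renders two BRT trees, backed by a Lucene index for node details,
        // with the given keywords highlighted.
        VisualizeTree(new[] { @"D:\trees\day1.brt", @"D:\trees\day2.brt" },
                      luceneIndex: @"D:\DataProcess\Index\News",
                      keywords: new[] { "ebola" });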
Example #8
        public void Start()
        {
            string debugFileName = Configure.OutputPath + _debugFileName;

            if (File.Exists(debugFileName))
            {
                File.Delete(debugFileName);
            }

            var        reader = LuceneOperations.GetIndexReader(Configure.InputPath);
            List <int> docIDs = new List <int>();

            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                docIDs.Add(iDoc);
            }

            if (Configure.IsRemoveSameURL)
            {
                Console.WriteLine("=====================RemoveSameURL=====================");
                docIDs = RemoveSameURLDocument(reader, docIDs);
            }

            if (Configure.IsRemoveSimilarContent)
            {
                for (int iGranu = 0; iGranu < Configure.RemoveDateGranularity.Length; iGranu++)
                {
                    int timeGranu = Configure.RemoveDateGranularity[iGranu];
                    int wordGranu = Configure.RemoveWordGranularity[iGranu];

                    Console.WriteLine("========Remove Similar Document: {0} out of {1}, Granu: {2} {3}========",
                                      iGranu, Configure.RemoveDateGranularity.Length, timeGranu, wordGranu);

                    docIDs = RemoveSimilarDocumentsGranu(reader, docIDs, timeGranu, wordGranu);
                }
            }

            var writer = LuceneOperations.GetIndexWriter(Configure.OutputPath);

            foreach (var docID in docIDs)
            {
                writer.AddDocument(reader.Document(docID));
            }

            writer.Optimize();
            writer.Close();
            reader.Close();

            Console.WriteLine("All done");
            //Console.ReadKey();
        }
Example #9
        /// <summary>
        /// Extract the unigrams, bigrams and trigrams of signal tweets.
        /// Requires running MatchSignal.match_ori() first.
        /// Preparation step for the signal-tweet clustering method cluster_ori().
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        /// <param name="gramsList">List of unigrams, bigrams and trigrams of signal tweets</param>
        /// <param name="rec2iDoc">Dictionary from 3-grams record list # to tweet ID #</param>
        /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list #</param>
        public static void preCluster_ori(string fileName, List <List <HashSet <string> > > gramsList, Dictionary <int, int> rec2iDoc, Dictionary <int, int> iDoc2rec)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("signal.txt", Encoding.Default);

            string line;
            int    recNum = 0;

            while ((line = sr.ReadLine()) != null)
            {
                int      iDoc  = int.Parse(line);
                Document inDoc = indexReader.Document(iDoc);
                string   text  = inDoc.Get("Text").ToLower();
                text = Regex.Replace(text, @"\s+", " ");
                text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");

                string[] gramArray             = Regex.Split(text, " ");
                List <HashSet <string> > grams = new List <HashSet <string> >();

                HashSet <string> unigram = new HashSet <string>();
                for (int i = 0; i < gramArray.Length; i++)
                {
                    unigram.Add(gramArray[i]);
                }
                grams.Add(unigram);

                HashSet <string> bigram = new HashSet <string>();
                for (int i = 0; i < gramArray.Length - 1; i++)
                {
                    bigram.Add(gramArray[i] + " " + gramArray[i + 1]);
                }
                grams.Add(bigram);

                HashSet <string> trigram = new HashSet <string>();
                for (int i = 0; i < gramArray.Length - 2; i++)
                {
                    trigram.Add(gramArray[i] + " " + gramArray[i + 1] + " " + gramArray[i + 2]);
                }
                grams.Add(trigram);

                if (recNum % 1000 == 0)
                {
                    Console.WriteLine(recNum);
                }
                gramsList.Add(grams);
                rec2iDoc.Add(recNum, iDoc);
                iDoc2rec.Add(iDoc, recNum);
                recNum++;
            }
            sr.Close();
        }
Example #10
        /// <summary>
        /// Match rumor patterns to find signal tweets
        /// Preparation step for method ClusterSignal.preCluster_ori()
        /// Output: signal.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void match_ori(string fileName)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            FileStream   fs          = new FileStream("signal.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc = indexReader.Document(iDoc);
                string   text  = inDoc.Get("Text").ToLower();
                if (Regex.IsMatch(text, @"is (this|that|it) true"))
                {
                    sw.WriteLine(iDoc);
                    continue;
                }
                if (Regex.IsMatch(text, @"(^|[^A-Za-z] )wh(a*)t([\?!]+)"))
                {
                    sw.WriteLine(iDoc);
                    continue;
                }
                if (Regex.IsMatch(text, @"(real\?|really\?|unconfirmed)"))
                {
                    sw.WriteLine(iDoc);
                    continue;
                }
                if (Regex.IsMatch(text, @"(rumor|debunk)"))
                {
                    sw.WriteLine(iDoc);
                    continue;
                }
                if (Regex.IsMatch(text, @"(that|this|it) is not true"))
                {
                    sw.WriteLine(iDoc);
                    continue;
                }
                if (iDoc % 100000 == 0)
                {
                    Console.WriteLine(iDoc);
                }
            }

            sw.Close();
            fs.Close();
        }
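The summaries above describe a pipeline: match_ori() writes signal.txt, which preCluster_ori() then consumes. A hedged wiring sketch (the index path is illustrative; cluster_ori() is referenced in the comments but not shown here):

        // Step 1: scan the index for rumor-signal patterns -> signal.txt
        MatchSignal.match_ori(@"D:\DataProcess\TweetIndex\Tweets");

        // Step 2: build the n-gram records used by the clustering step
        var gramsList = new List <List <HashSet <string> > >();
        var rec2iDoc  = new Dictionary <int, int>();
        var iDoc2rec  = new Dictionary <int, int>();
        ClusterSignal.preCluster_ori(@"D:\DataProcess\TweetIndex\Tweets",
                                     gramsList, rec2iDoc, iDoc2rec);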
Example #11
        public void AnalyzeDocuments()
        {
            string fileName  = @"D:\Project\TopicPanorama\data\TopicGraphs\NewCode-Ebola-Test2\Raw\news\result\lda.top.json";
            string indexPath = @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1";

            int topDocCnt = 20;

            var indexReader = LuceneOperations.GetIndexReader(indexPath);

            //Read from json and sort
            SimpleJsonReader reader = new SimpleJsonReader(new StreamReader(File.Open(fileName, FileMode.Open)));

            HeapSortDouble[] hsd         = null;
            int              topicNumber = -1;
            ProgramProgress  progress    = new ProgramProgress(indexReader.NumDocs());

            while (reader.IsReadable)
            {
                int      docID      = int.Parse(reader.ReadPropertyName());
                double[] topicArray = reader.ReadDoubleArray();

                if (topicNumber < 0)
                {
                    topicNumber = topicArray.Length;
                    hsd         = new HeapSortDouble[topicNumber];
                    for (int i = 0; i < topicNumber; i++)
                    {
                        hsd[i] = new HeapSortDouble(topDocCnt);
                    }
                }

                for (int i = 0; i < topicNumber; i++)
                {
                    hsd[i].Insert(docID, topicArray[i]);
                }
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            //Statistics


            Console.ReadLine();
        }
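The //Statistics section above is left empty. One way to finish it, assuming HeapSortDouble exposes GetSortedDictionary() as used in Example #18 and that the index stores a NewsArticleHeadline field, is to print the top documents per topic:

            // Hedged completion sketch for the empty //Statistics section.
            for (int iTopic = 0; iTopic < topicNumber; iTopic++)
            {
                Console.WriteLine("Topic " + iTopic);
                foreach (var kvp in hsd[iTopic].GetSortedDictionary())
                {
                    // kvp.Key is the docID, kvp.Value its weight for this topic
                    Console.WriteLine("\t" + kvp.Key + "\t" + kvp.Value + "\t" +
                                      indexReader.Document(kvp.Key).Get("NewsArticleHeadline"));
                }
            }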
Example #12
        /// <summary>
        /// Output representative tweet text of each tweet cluster
        /// Requires running selectRepresentative() first
        /// Output: clusterRepOriginalText.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void ouputRepresentativeOriginalText(string fileName)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("clusterRepIDoc.txt", Encoding.Default);
            FileStream   fs          = new FileStream("clusterRepOriginalText.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            string line;

            while ((line = sr.ReadLine()) != null)
            {
                Document inDoc = indexReader.Document(int.Parse(line));
                string   text  = inDoc.Get("Text");
                text = Regex.Replace(text, @"#N#", "");
                text = Regex.Replace(text, @"#n#", "");
                text = Regex.Replace(text, @"\s+", " ");
                sw.WriteLine(text);
            }

            sw.Close();
            fs.Close();
        }
Example #13
        public void Start()
        {
            var             reader   = LuceneOperations.GetIndexReader(Configure.InputPath);
            var             docNum   = reader.NumDocs();
            ProgramProgress progress = new ProgramProgress(docNum);

            XmlDoc[] xmlDocs = new XmlDoc[docNum];
            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var doc = reader.Document(iDoc);
                xmlDocs[iDoc] = new XmlDoc(doc);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            // Serialize the collection to the output file; closing the writer
            // flushes the XML to disk
            XmlSerializer serializer = new XmlSerializer(typeof(XmlDocCollection));

            var sw = new StreamWriter(Configure.OutputPath);
            serializer.Serialize(sw, new XmlDocCollection()
            {
                XmlDocs = xmlDocs
            });
            sw.Close();

            reader.Close();
        }
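Reading the collection back is symmetric. This assumes XmlDocCollection and XmlDoc are public types with parameterless constructors, which XmlSerializer requires:

        // Hedged read-back sketch for the file written above.
        public XmlDocCollection Load(string path)
        {
            var serializer = new XmlSerializer(typeof(XmlDocCollection));
            using (var sr = new StreamReader(path))
            {
                return (XmlDocCollection)serializer.Deserialize(sr);
            }
        }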
Example #14
        /// <summary>
        /// Output name entity set of each tweet cluster
        /// Output: clusterNameEntitySet.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void nameEntitySet(string fileName)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("signalCluster.txt", Encoding.Default);
            StreamReader sr1         = new StreamReader("generalCluster.txt", Encoding.Default);
            FileStream   fs          = new FileStream("clusterNameEntitySet.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            // Path to the folder with classifier models
            var jarRoot = @"..\..\..\..\stanford-ner-2015-12-09";
            var classifiersDirectory = jarRoot + @"\classifiers";

            // Loading the 3-class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

            string line;
            string line1;

            while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null)
            {
                line  = sr.ReadLine();
                line1 = sr1.ReadLine();
                sr.ReadLine();
                sr1.ReadLine();

                string[]   iDocStrArray = Regex.Split(line, " ");
                List <int> iDocList     = new List <int>();
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }

                string[]   iDocStrArray1 = Regex.Split(line1, " ");
                List <int> iDocList1     = new List <int>();
                for (int i = 0; i < iDocStrArray1.Length - 1; i++)
                {
                    iDocList1.Add(int.Parse(iDocStrArray1[i]));
                }

                HashSet <string> nameEntitySet = new HashSet <string>();

                for (int i = 0; i < iDocList.Count; i++)
                {
                    Document inDoc = indexReader.Document(iDocList[i]);
                    string   text  = inDoc.Get("Text");
                    text = Regex.Replace(text, @"\s+", " ");
                    text = Regex.Replace(text, @"#n#|#N#", "");
                    text = Regex.Replace(text, @"#", "");
                    text = Regex.Replace(text, @"@", "");
                    text = classifier.classifyWithInlineXML(text);
                    MatchCollection mc;
                    mc = Regex.Matches(text, @"<PERSON>[^<>]+</PERSON>");
                    var it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        nameEntitySet.Add(str.Substring(8, str.Length - 17));
                    }
                    mc = Regex.Matches(text, @"<ORGANIZATION>[^<>]+</ORGANIZATION>");
                    it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        nameEntitySet.Add(str.Substring(14, str.Length - 29));
                    }
                    mc = Regex.Matches(text, @"<LOCATION>[^<>]+</LOCATION>");
                    it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        nameEntitySet.Add(str.Substring(10, str.Length - 21));
                    }
                }

                for (int i = 0; i < iDocList1.Count; i++)
                {
                    Document inDoc = indexReader.Document(iDocList1[i]);
                    string   text  = inDoc.Get("Text");
                    text = Regex.Replace(text, @"\s+", " ");
                    text = Regex.Replace(text, @"#n#|#N#", "");
                    text = Regex.Replace(text, @"#", "");
                    text = Regex.Replace(text, @"@", "");
                    text = classifier.classifyWithInlineXML(text);
                    MatchCollection mc;
                    mc = Regex.Matches(text, @"<PERSON>[^<>]+</PERSON>");
                    var it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        nameEntitySet.Add(str.Substring(8, str.Length - 17));
                    }
                    mc = Regex.Matches(text, @"<ORGANIZATION>[^<>]+</ORGANIZATION>");
                    it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        nameEntitySet.Add(str.Substring(14, str.Length - 29));
                    }
                    mc = Regex.Matches(text, @"<LOCATION>[^<>]+</LOCATION>");
                    it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        nameEntitySet.Add(str.Substring(10, str.Length - 21));
                    }
                }

                var iter = nameEntitySet.GetEnumerator();
                for (int i = 0; i < nameEntitySet.Count; i++)
                {
                    iter.MoveNext();
                    sw.Write(iter.Current.ToString() + "; ");
                }

                sw.WriteLine();
            }

            sw.Close();
            fs.Close();
            sr1.Close();
            sr.Close();
        }
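The three per-tag extraction loops above differ only in the tag name and the hand-counted Substring offsets. A hedged helper (not part of the original code) that does the same with a regex capture group:

        // Collects the inner text of <TAG>...</TAG> spans produced by
        // classifyWithInlineXML, e.g. ExtractEntities(text, "PERSON").
        static IEnumerable <string> ExtractEntities(string taggedText, string tag)
        {
            foreach (Match m in Regex.Matches(taggedText,
                         "<" + tag + ">([^<>]+)</" + tag + ">"))
            {
                yield return m.Groups[1].Value;
            }
        }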
Example #15
        public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
        {
            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var docNum      = indexReader.NumDocs();

            int[] docWordCnt     = new int[docNum];
            int[] docUniqWordCnt = new int[docNum];
            Dictionary <string, int> wordDocCntDict = new Dictionary <string, int>();
            Dictionary <string, int> wordOccCntDict = new Dictionary <string, int>();

            var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
                ? BingNewsFields.FeatureVectorFieldWeights
                : BingNewsFields.NewsFieldWeights;

            ProgramProgress progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var document = indexReader.Document(iDoc);
                var content  = LuceneOperations.GetContent(document, fieldWeights);

                var words       = NLPOperations.Tokenize(content, tokenConfig);
                var uniqueWords = new HashSet <string>(words);
                docWordCnt[iDoc]     = words.Count;
                docUniqWordCnt[iDoc] = uniqueWords.Count;

                foreach (var word in uniqueWords)
                {
                    if (!wordDocCntDict.ContainsKey(word))
                    {
                        wordDocCntDict.Add(word, 0);
                    }
                    wordDocCntDict[word]++;
                }

                foreach (var word in words)
                {
                    if (!wordOccCntDict.ContainsKey(word))
                    {
                        wordOccCntDict.Add(word, 0);
                    }
                    wordOccCntDict[word]++;
                }

                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            indexReader.Close();

            //Statistics
            DoubleStatistics statDocWordCnt     = new DoubleStatistics();
            DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
            DoubleStatistics statWordDocCnt     = new DoubleStatistics();
            DoubleStatistics statWordOccCnt     = new DoubleStatistics();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                statDocWordCnt.AddNumber(docWordCnt[iDoc]);
                statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
            }

            foreach (var kvp in wordDocCntDict)
            {
                statWordDocCnt.AddNumber(kvp.Value);
            }

            foreach (var kvp in wordOccCntDict)
            {
                statWordOccCnt.AddNumber(kvp.Value);
            }


            Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
            Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
            Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
            Console.WriteLine(statWordOccCnt.ToString("wordOccCnt"));

            //Hist
            var docWordCntHist       = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1);
            var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1);
            var wordDocCntHist       = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
            var wordDocCntHist2      = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1);

            docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
            docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
            wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
            wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

            Console.Read();
        }
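A hypothetical invocation; the index path is illustrative, the TokenizeConfig constructor is the one used in Example #22, and StopWordsFile.EN is assumed to exist alongside the CH value shown there:

        // Prints the four word-count statistics and writes the histogram CSVs
        // into the index folder.
        AnalyzeTwitterWordDistribution(@"D:\DataProcess\Index\Tweets",
            new TokenizeConfig(TokenizerType.FeatureVector, StopWordsFile.EN));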
Example #16
        public void Start()
        {
            Initialize();

            var reader = LuceneOperations.GetIndexReader(Configure.InputPath);

            InitializeWriters();

            var docNum   = reader.NumDocs();
            var progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var  doc    = reader.Document(iDoc);
                bool isSkip = false;

                //random sample
                if (!isSkip && Configure.IsSampling)
                {
                    if (Random.NextDouble() > Configure.SampleRatio)
                    {
                        isSkip = true;
                    }
                }

                //filter by time
                if (!isSkip && Configure.IsSelectByTime)
                {
                    var dateTime = StringOperations.ParseDateTimeString(
                        doc.Get(Configure.TimeField), Configure.ParseTimeFormat);
                    if (dateTime.Subtract(StartDateTime).Ticks < 0 ||
                        dateTime.Subtract(EndDateTime).Ticks > 0)
                    {
                        isSkip = true;
                    }
                }

                //filter by exact match
                if (!isSkip && Configure.IsSelectByExactMatch)
                {
                    foreach (var kvp in Configure.FieldMatchDict)
                    {
                        if (doc.Get(kvp.Key) != kvp.Value)
                        {
                            isSkip = true;
                            break;
                        }
                    }
                }

                if (!isSkip)
                {
                    GetWriter(doc).AddDocument(doc);
                }

                progress.PrintIncrementExperiment();
            }

            CloseWriters();

            reader.Close();
        }
Example #17
        public void Start()
        {
            string inputPath  = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\";
            string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\";

            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

            char[]   separator = new char[] { ' ' };
            string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName",
                                                "Retweet", "Mention" };
            ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());

            //for (int iDoc = 0; iDoc < 1000; iDoc++)
            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                Document inDoc  = indexReader.Document(iDoc);
                Document outDoc = new Document();

                string   inTime   = inDoc.Get("CreateAt");
                DateTime dateTime = DateTime.Parse(inTime);
                outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED));

                string hashtag = inDoc.Get("Hashtag");
                string word    = inDoc.Get("Word");
                if (hashtag == null)
                {
                    hashtag = "";
                }
                var hashtagTokens = hashtag.Split(separator, StringSplitOptions.RemoveEmptyEntries);
                var wordTokens    = word.Split(separator, StringSplitOptions.RemoveEmptyEntries);

                string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : "";
                outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED));

                outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED));

                string           featureVector = "";
                Counter <string> counter       = new Counter <string>();
                foreach (var tag in hashtagTokens)
                {
                    counter.Add(tag);
                    counter.Add(tag);
                }
                foreach (var w in wordTokens)
                {
                    counter.Add(w);
                }
                foreach (var kvp in counter.GetSortedCountDictioanry())
                {
                    featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value);
                }
                outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED));

                outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

                foreach (var aidField in aidFields)
                {
                    var value = inDoc.Get(aidField);
                    outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED));
                }

                indexWriter.AddDocument(outDoc);

                progress.PrintIncrementExperiment();
            }

            indexWriter.Optimize();
            indexWriter.Close();

            indexReader.Close();
        }
Example #18
        public void Start()
        {
            if (!Configure.InputPath.EndsWith("\\"))
            {
                Configure.InputPath += "\\";
            }
            var reader     = LuceneOperations.GetIndexReader(Configure.InputPath);
            var docNum     = reader.NumDocs();
            var docNumPart = Math.Max(1, docNum / 100); // guard: avoid modulo-by-zero for small indexes

            Console.WriteLine("Total: " + docNum);

            Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed);

            //Topwords
            var counter = new Counter <string>();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                if (iDoc % docNumPart == 0)
                {
                    Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                }
                if (random.NextDouble() > Configure.SampleRatio)
                {
                    continue;
                }

                var doc     = reader.Document(iDoc);
                var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                var words   = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                foreach (var word in words)
                {
                    counter.Add(word);
                }
            }
            var topwords        = counter.GetMostFreqObjs(Configure.TopWordCount);
            var wordCounterDict = counter.GetCountDictionary();

            var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt");

            foreach (var topword in topwords)
            {
                swTopWords.WriteLine(topword);
            }
            swTopWords.Flush();
            swTopWords.Close();

            //CoOccurrence
            if (Configure.IsPrintCooccurrence)
            {
                var occurCounterDict = new Dictionary <string, Counter <string> >();
                foreach (var topword in topwords)
                {
                    occurCounterDict.Add(topword, new Counter <string>());
                }
                for (int iDoc = 0; iDoc < docNum; iDoc++)
                {
                    if (iDoc % docNumPart == 0)
                    {
                        Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                    }
                    if (random.NextDouble() > Configure.SampleRatio)
                    {
                        continue;
                    }

                    var doc     = reader.Document(iDoc);
                    var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                    var words   = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));
                    foreach (var word in words)
                    {
                        if (occurCounterDict.ContainsKey(word))
                        {
                            var occurCounter = occurCounterDict[word];
                            foreach (var word2 in words)
                            {
                                if (word2 == word)
                                {
                                    continue;
                                }
                                if (occurCounterDict.ContainsKey(word2))
                                {
                                    occurCounter.Add(word2);
                                }
                            }
                        }
                    }
                }
                var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
                var pairDict = new Dictionary <int, Tuple <string, string> >();
                var iPair    = 0;
                foreach (var kvp in occurCounterDict)
                {
                    var word         = kvp.Key;
                    var occurCounter = kvp.Value;
                    foreach (var kvp2 in occurCounter.GetCountDictionary())
                    {
                        heapSort.Insert(iPair, kvp2.Value);
                        pairDict.Add(iPair, new Tuple <string, string>(word, kvp2.Key));
                        iPair++;
                    }
                }

                var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt");
                foreach (var kvp in heapSort.GetSortedDictionary())
                {
                    var pair = pairDict[kvp.Key];
                    swCoOccurrence.WriteLine("{0} - {1}\t{2}",
                                             pair.Item1, pair.Item2, kvp.Value);
                }

                swCoOccurrence.Flush();
                swCoOccurrence.Close();
            }

            reader.Close();
        }
Example #19
        /// <summary>
        /// Rank general clusters with a naive algorithm to find the most likely rumors
        /// Output: rankCluster.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        /// <param name="rList">List of tweet ID # list of signal tweets in each tweet cluster</param>
        /// <param name="gList">List of tweet ID # list of non-signal tweets in each tweet cluster</param>
        public static void rank_naive(string fileName, List <List <int> > rList, List <List <int> > gList)
        {
            StreamReader sr = new StreamReader("generalCluster.txt", Encoding.Default);
            string       line;

            while ((line = sr.ReadLine()) != null)
            {
                line = sr.ReadLine();
                sr.ReadLine();
                string[]   iDocStrArray = Regex.Split(line, " ");
                List <int> iDocList     = new List <int>();
                if (iDocStrArray == null)
                {
                    gList.Add(iDocList);
                    continue;
                }
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }
                gList.Add(iDocList);
            }
            sr.Close();

            List <ScoreRec> scoreList   = new List <ScoreRec>();
            var             indexReader = LuceneOperations.GetIndexReader(fileName);

            MatchCollection mc;
            int             count;

            for (int i = 0; i < gList.Count; i++)
            {
                if (i % 10 == 0)
                {
                    Console.WriteLine(i);
                }
                double score            = 0.0;
                double count_popularity = 0.2 * Math.Log10((double)(rList[i].Count + gList[i].Count));
                double count_signal     = 0.3 * (double)rList[i].Count / (double)(rList[i].Count + gList[i].Count);
                double count_url        = 0.0;
                double count_mention    = 0.0;
                double count_length     = 0.0;

                for (int j = 0; j < rList[i].Count; j++)
                {
                    int      iDoc  = rList[i][j];
                    Document inDoc = indexReader.Document(iDoc);
                    string   text  = inDoc.Get("Text").ToLower();

                    mc    = Regex.Matches(text, @"http:");
                    count = mc.Count;
                    if (count > 2)
                    {
                        count = 2;
                    }
                    count_url += count;

                    mc    = Regex.Matches(text, @"@");
                    count = mc.Count;
                    if (count > 5)
                    {
                        count = 5;
                    }
                    count_mention += count;

                    text = Regex.Replace(text, @"\s+", " ");
                    text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");
                    string[] gramArray = Regex.Split(text, " ");
                    count_length += gramArray.Length;
                }

                for (int j = 0; j < gList[i].Count; j++)
                {
                    int      iDoc  = gList[i][j];
                    Document inDoc = indexReader.Document(iDoc);
                    string   text  = inDoc.Get("Text").ToLower();

                    mc    = Regex.Matches(text, @"http:");
                    count = mc.Count;
                    if (count > 2)
                    {
                        count = 2;
                    }
                    count_url += count;

                    mc    = Regex.Matches(text, @"@");
                    count = mc.Count;
                    if (count > 5)
                    {
                        count = 5;
                    }
                    count_mention += count;

                    text = Regex.Replace(text, @"\s+", " ");
                    text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");
                    string[] gramArray = Regex.Split(text, " ");
                    count_length += gramArray.Length;
                }

                count_url     /= (double)(rList[i].Count + gList[i].Count);
                count_mention /= (double)(rList[i].Count + gList[i].Count);
                count_length  /= (double)(rList[i].Count + gList[i].Count);

                count_url     = (2 - count_url) * 0.1;
                count_mention = (5 - count_mention) * 0.05;
                count_length  = (140 / count_length > 10 ? 10 : 140 / count_length) * 0.02;

                score = count_popularity + count_signal + count_url + count_mention + count_length;
                scoreList.Add(new ScoreRec(score, i));
            }
            scoreList.Sort(new ScoreRecComparer());
            FileStream   fs = new FileStream("rankCluster.txt", FileMode.Create);
            StreamWriter sw = new StreamWriter(fs, Encoding.Default);

            for (int i = 0; i < gList.Count; i++)
            {
                Console.WriteLine(i + ": " + scoreList[i].score + " " + scoreList[i].rec);
                sw.WriteLine(i + ": " + scoreList[i].score + " " + scoreList[i].rec);
            }
            sw.Close();
            fs.Close();
        }
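In compact form, the score computed above for cluster i (with n = rList[i].Count + gList[i].Count, per-tweet URL matches capped at 2 and mentions at 5) is:

    score = 0.2*log10(n) + 0.3*(signalCount/n) + 0.1*(2 - avgUrl) + 0.05*(5 - avgMention) + 0.02*min(140/avgLen, 10)

Larger clusters with a higher share of signal tweets, fewer URLs and mentions, and shorter tweets score higher, which this heuristic treats as more rumor-like.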
Example #20
        /// <summary>
        /// Calculate mention similarity matrix of tweet clusters
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void mentionSimilarity(string fileName)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("signalCluster.txt", Encoding.Default);
            StreamReader sr1         = new StreamReader("generalCluster.txt", Encoding.Default);
            FileStream   fs          = new FileStream("clusterMentionSimilarity.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            var    mentionList = new List <HashSet <string> >();
            string line;

            while ((line = sr.ReadLine()) != null)
            {
                line = sr.ReadLine();
                sr.ReadLine();
                string[]   iDocStrArray = Regex.Split(line, " ");
                List <int> iDocList     = new List <int>();
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }
                sr1.ReadLine();
                line = sr1.ReadLine();
                sr1.ReadLine();
                iDocStrArray = Regex.Split(line, " ");
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }

                var mention = new HashSet <string>();
                for (int i = 0; i < iDocList.Count; i++)
                {
                    Document inDoc       = indexReader.Document(iDocList[i]);
                    string   userSrnName = inDoc.Get("UserScreenName");
                    mention.Add(userSrnName);
                    string          text = inDoc.Get("Text");
                    MatchCollection mc;
                    mc = Regex.Matches(text, @"@[A-Za-z0-9_]+");
                    var it = mc.GetEnumerator();
                    for (int j = 0; j < mc.Count; j++)
                    {
                        it.MoveNext();
                        string str = it.Current.ToString();
                        mention.Add(str.Substring(1));
                    }
                }
                mentionList.Add(mention);
            }

            for (int i = 0; i < mentionList.Count; i++)
            {
                var mention1 = mentionList[i];
                for (int j = 0; j < mentionList.Count; j++)
                {
                    var mention2 = mentionList[j];
                    int sim      = 0;
                    foreach (var name in mention1)
                    {
                        if (mention2.Contains(name))
                        {
                            sim = 1;
                            break;
                        }
                    }
                    sw.Write(sim + " ");
                }
                sw.WriteLine();
            }

            sw.Close();
            fs.Close();
            sr1.Close();
            sr.Close();
        }
Example #21
        /// <summary>
        /// Select a representative tweet for each tweet cluster
        /// Output: clusterRepIDoc.txt, clusterRepText.txt, clusterRepWords.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        /// <param name="gramsList">List of 3-grams sets of signal tweets in each signal tweet cluster</param>
        /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list #</param>
        public static void selectRepresentative(string fileName, List <List <HashSet <string> > > gramsList, Dictionary <int, int> iDoc2rec)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("signalCluster.txt", Encoding.Default);
            FileStream   fs          = new FileStream("clusterRepIDoc.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);
            FileStream   fs1         = new FileStream("clusterRepText.txt", FileMode.Create);
            StreamWriter sw1         = new StreamWriter(fs1, Encoding.Default);
            FileStream   fs2         = new FileStream("clusterRepWords.txt", FileMode.Create);
            StreamWriter sw2         = new StreamWriter(fs2, Encoding.Default);

            string line;

            while ((line = sr.ReadLine()) != null)
            {
                line = sr.ReadLine();
                sr.ReadLine();
                string[]   iDocStrArray = Regex.Split(line, " ");
                List <int> iDocList     = new List <int>();
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }

                double[] simArr = new double[iDocList.Count];
                for (int i = 0; i < iDocList.Count; i++)
                {
                    simArr[i] = 0.0;
                }

                for (int i = 0; i < iDocList.Count; i++)
                {
                    int rec1 = iDoc2rec[iDocList[i]];
                    for (int j = i + 1; j < iDocList.Count; j++)
                    {
                        int    rec2 = iDoc2rec[iDocList[j]];
                        double sim  = ClusterGeneral.jaccard(gramsList[rec1], gramsList[rec2]);
                        simArr[i] += sim;
                        simArr[j] += sim;
                    }
                }

                if (iDocList.Count > 1)
                {
                    for (int i = 0; i < iDocList.Count; i++)
                    {
                        simArr[i] /= (iDocList.Count - 1);
                    }
                }

                double maxSim      = -1.0;
                int    maxSimIndex = -1;
                for (int i = 0; i < iDocList.Count; i++)
                {
                    if (simArr[i] > maxSim)
                    {
                        maxSim      = simArr[i];
                        maxSimIndex = i;
                    }
                }

                int      iDoc  = iDocList[maxSimIndex];
                Document inDoc = indexReader.Document(iDoc);
                string   text  = inDoc.Get("Text").ToLower();
                text = Regex.Replace(text, @"\s+", " ");
                text = Regex.Replace(text, @"#n#", "");
                string words = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");
                sw.WriteLine(iDoc);
                sw1.WriteLine(text);
                sw2.WriteLine(words);
            }

            sw2.Close();
            fs2.Close();
            sw1.Close();
            fs1.Close();
            sw.Close();
            fs.Close();
            sr.Close();
        }
Example #22
        public static void Test()
        {
            string indexPath = @"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\WeiboSortByHotIndex_Time_RemoveNoise2_RemoveSimilar2";
            var    reader    = LuceneOperations.GetIndexReader(indexPath);
            //var keywords = new string[]{"街","信","死","女","清","刷","骂","愿","爱","查","舰","版","通","岁","撕"};

            //foreach (var keyword in keywords)
            {
                var sw = new StreamWriter(@"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\TestTokenizer" + "Stat" + ".txt", false,
                                          Encoding.UTF8);
                //ChineseWordBreaker chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");
                int cnt1 = 0, cnt2 = 0;
                int cnt1all = 0, cnt2all = 0;

                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    string sentence = reader.Document(iDoc).Get("NewsArticleDescription");

                    var words1 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ICTCLAS, StopWordsFile.CH));
                    var words2 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ChineseWordBreaker, StopWordsFile.CH));

                    //bool isPrint = false;
                    //foreach (var word in words1)
                    //    if (word.Length == 1)
                    //    {
                    //        isPrint = true;
                    //        cnt1++;
                    //    }
                    //foreach (var word in words2)
                    //    if (word.Length == 2)
                    //    {
                    //        isPrint = true;
                    //        cnt2++;
                    //    }
                    cnt1all += words1.Count;
                    cnt2all += words2.Count;

                    //if (isPrint)
                    //{
                    //    sw.WriteLine("-------------{0}-------------", iDoc);
                    //    sw.WriteLine(sentence);
                    //    sw.WriteLine("[ICT]\t" + StringOperations.GetMergedString(words1));
                    //    sw.WriteLine("[CWB]\t" + StringOperations.GetMergedString(words2));

                    //    sw.WriteLine("[ICT--]\t" + Marshal.PtrToStringAnsi(NLPIR_ParagraphProcess(sentence, 1)));
                    //    //sw.WriteLine("[CWB--]\t" + chineseWordBreaker.GetResult(sentence));
                    //    sw.WriteLine();

                    //    sw.Flush();
                    //}
                }

                sw.WriteLine("cnt1 = " + cnt1);
                sw.WriteLine("cnt2 = " + cnt2);
                sw.WriteLine("cnt1all = " + cnt1all);
                sw.WriteLine("cnt2all = " + cnt2all);

                sw.Flush();
                sw.Close();
            }
        }
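
The raw totals written at the end of Test() are easiest to compare as average tokens per document. A minimal hedged sketch of that summary step; the helper name and its parameters are illustrative and not part of the original example:

        // Hedged sketch: report each tokenizer's average tokens per document.
        // Only the cnt1all/cnt2all totals and the document count come from Test()
        // above; everything else here is an illustrative assumption.
        public static void PrintAverageTokenCounts(int cnt1all, int cnt2all, int numDocs, StreamWriter sw)
        {
            if (numDocs == 0) return; // empty index: nothing to average
            sw.WriteLine("avg tokens/doc (ICTCLAS)            = " + (double)cnt1all / numDocs);
            sw.WriteLine("avg tokens/doc (ChineseWordBreaker) = " + (double)cnt2all / numDocs);
        }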
Beispiel #23
        /// <summary>
        /// Output the hashtag set of each tweet cluster, merging the signal members
        /// (read from signalCluster.txt) and general members (read from generalCluster.txt).
        /// Output: clusterHashtagSet.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        public static void hashtagSet(string fileName)
        {
            var          indexReader = LuceneOperations.GetIndexReader(fileName);
            StreamReader sr          = new StreamReader("signalCluster.txt", Encoding.Default);
            StreamReader sr1         = new StreamReader("generalCluster.txt", Encoding.Default);
            FileStream   fs          = new FileStream("clusterHashtagSet.txt", FileMode.Create);
            StreamWriter sw          = new StreamWriter(fs, Encoding.Default);

            string line;
            string line1;

            // Both cluster files consist of three-line records: a header line, a line of
            // space-separated doc IDs, and a blank separator. The while condition consumes
            // the headers; the reads below pick up the ID lines and skip the separators.
            while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null)
            {
                line  = sr.ReadLine();
                line1 = sr1.ReadLine();
                sr.ReadLine();
                sr1.ReadLine();

                // Each ID line ends with a trailing space, so the last split token is
                // empty and is skipped by stopping at Length - 1.
                string[]   iDocStrArray = Regex.Split(line, " ");
                List <int> iDocList     = new List <int>();
                for (int i = 0; i < iDocStrArray.Length - 1; i++)
                {
                    iDocList.Add(int.Parse(iDocStrArray[i]));
                }

                string[]   iDocStrArray1 = Regex.Split(line1, " ");
                List <int> iDocList1     = new List <int>();
                for (int i = 0; i < iDocStrArray1.Length - 1; i++)
                {
                    iDocList1.Add(int.Parse(iDocStrArray1[i]));
                }

                HashSet <string> hashtagSet = new HashSet <string>();

                // Collect hashtags from the signal members of the cluster.
                for (int i = 0; i < iDocList.Count; i++)
                {
                    Document inDoc = indexReader.Document(iDocList[i]);
                    string   text  = inDoc.Get("Text").ToLower();
                    text = Regex.Replace(text, @"\s+", " ");
                    text = Regex.Replace(text, @"#n#", "");
                    foreach (Match m in Regex.Matches(text, @"#[A-Za-z0-9_]+"))
                    {
                        hashtagSet.Add(m.Value);
                    }
                }

                // Collect hashtags from the general (non-signal) members of the cluster.
                for (int i = 0; i < iDocList1.Count; i++)
                {
                    Document inDoc = indexReader.Document(iDocList1[i]);
                    string   text  = inDoc.Get("Text").ToLower();
                    text = Regex.Replace(text, @"\s+", " ");
                    text = Regex.Replace(text, @"#n#", "");
                    foreach (Match m in Regex.Matches(text, @"#[A-Za-z0-9_]+"))
                    {
                        hashtagSet.Add(m.Value);
                    }
                }

                foreach (var hashtag in hashtagSet)
                {
                    if (hashtag != "#ebola")
                    {
                        sw.Write(hashtag + " ");
                    }
                }

                sw.WriteLine();
            }

            sw.Close();
            fs.Close();
            sr1.Close();
            sr.Close();
        }
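
generalCluster.txt is written by cluster_ori (below) as three-line records: a header line ("clusterId count cumulativeCount"), a line of space-separated doc IDs with a trailing space, and a blank separator line; signalCluster.txt is read the same way above, so it presumably shares this layout. A hedged sketch of reading one such record; the helper name is illustrative and not part of the original code:

        // Hedged sketch: read one three-line cluster record, returning the doc IDs,
        // or null at end of file.
        public static List <int> ReadClusterRecord(StreamReader sr)
        {
            string header = sr.ReadLine();   // "clusterId count cumulativeCount"
            if (header == null) return null;
            string idLine = sr.ReadLine();   // space-separated doc IDs, trailing space
            if (idLine == null) return null; // truncated record
            sr.ReadLine();                   // blank separator line

            var iDocList = new List <int>();
            foreach (var token in idLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
            {
                iDocList.Add(int.Parse(token));
            }
            return iDocList;
        }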
Beispiel #24
        /// <summary>
        /// Cluster all tweets against the representation (frequently occurring n-grams,
        /// up to trigrams) of each signal tweet cluster. For each non-signal tweet, its
        /// unigram, bigram and trigram sets are compared with the representation of every
        /// signal tweet cluster, and the tweet is added to each cluster whose Jaccard
        /// similarity exceeds the threshold.
        /// Output: generalCluster.txt
        /// </summary>
        /// <param name="fileName">Lucene index folder path of tweets</param>
        /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list # of signal tweets</param>
        /// <param name="gramsClList">List of unigram, bigram and trigram sets of signal tweets</param>
        /// <param name="gList">List of tweet ID # lists of general (non-signal) tweets in each cluster; filled by this method</param>
        /// <param name="minTimeStr">Time stamp string of the earliest general tweets</param>
        /// <param name="maxTimeStr">Time stamp string of the latest general tweets</param>
        public static void cluster_ori(string fileName, Dictionary <int, int> iDoc2rec, List <List <HashSet <string> > > gramsClList, List <List <int> > gList, string minTimeStr = null, string maxTimeStr = null)
        {
            double jaccard_threshold  = 0.6; // minimum n-gram similarity for cluster membership
            var    indexReader        = LuceneOperations.GetIndexReader(fileName);
            int    signalClusterCount = gramsClList.Count;

            // One (initially empty) general-tweet list per signal cluster.
            for (int i = 0; i < signalClusterCount; i++)
            {
                gList.Add(new List <int>());
            }

            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                if (iDoc % 100 == 0)
                {
                    Console.WriteLine(iDoc);
                }
                // Signal tweets are already assigned to a cluster; skip them here.
                if (iDoc2rec.ContainsKey(iDoc))
                {
                    continue;
                }
                Document inDoc = indexReader.Document(iDoc);

                // Keep only tweets strictly inside the optional (minTime, maxTime) range.
                if (minTimeStr != null && maxTimeStr != null)
                {
                    string   timeStr = inDoc.Get("CreatedAt");
                    DateTime time    = DateTime.Parse(timeStr);
                    DateTime minTime = DateTime.Parse(minTimeStr);
                    DateTime maxTime = DateTime.Parse(maxTimeStr);
                    if (DateTime.Compare(time, minTime) <= 0 || DateTime.Compare(time, maxTime) >= 0)
                    {
                        continue;
                    }
                }

                string text = inDoc.Get("Text").ToLower();
                text = Regex.Replace(text, @"\s+", " ");
                text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", "");

                // Build the tweet's unigram, bigram and trigram sets.
                string[] gramArray             = Regex.Split(text, " ");
                List <HashSet <string> > grams = new List <HashSet <string> >();

                HashSet <string> unigram = new HashSet <string>();
                for (int i = 0; i < gramArray.Length; i++)
                {
                    unigram.Add(gramArray[i]);
                }
                grams.Add(unigram);

                HashSet <string> bigram = new HashSet <string>();
                for (int i = 0; i < gramArray.Length - 1; i++)
                {
                    bigram.Add(gramArray[i] + " " + gramArray[i + 1]);
                }
                grams.Add(bigram);

                HashSet <string> trigram = new HashSet <string>();
                for (int i = 0; i < gramArray.Length - 2; i++)
                {
                    trigram.Add(gramArray[i] + " " + gramArray[i + 1] + " " + gramArray[i + 2]);
                }
                grams.Add(trigram);

                // Add the tweet to every signal cluster whose n-gram representation
                // it matches above the Jaccard threshold.
                for (int i = 0; i < signalClusterCount; i++)
                {
                    if (jaccard(grams, gramsClList[i]) > jaccard_threshold)
                    {
                        gList[i].Add(iDoc);
                    }
                }
            }

            FileStream   fs = new FileStream("generalCluster.txt", FileMode.Create);
            StreamWriter sw = new StreamWriter(fs, Encoding.Default);

            int count = 0; // cumulative number of general tweets written so far

            for (int i = 0; i < gList.Count; i++)
            {
                count += gList[i].Count;
                // Three-line record: header, space-separated doc IDs, blank separator.
                sw.WriteLine(i + " " + gList[i].Count + " " + count);
                for (int j = 0; j < gList[i].Count; j++)
                {
                    sw.Write(gList[i][j] + " ");
                }
                sw.WriteLine();
                sw.WriteLine();
            }

            sw.Close();
            fs.Close();
        }
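
The jaccard helper called in the threshold check above (and ClusterGeneral.jaccard in the representative-selection code earlier) is not shown in this excerpt. A hedged sketch of one plausible implementation, assuming it averages the Jaccard coefficient over the paired unigram, bigram and trigram sets; the actual code may weight the n-gram orders differently:

        // Hedged sketch: average Jaccard coefficient |A ∩ B| / |A ∪ B| over the
        // paired n-gram sets of two tweets (or a tweet and a cluster representation).
        // Assumed behavior only; the original implementation is not in this excerpt.
        public static double jaccard(List <HashSet <string> > grams1, List <HashSet <string> > grams2)
        {
            int n = Math.Min(grams1.Count, grams2.Count);
            if (n == 0) return 0.0;

            double sum = 0.0;
            for (int k = 0; k < n; k++)
            {
                var intersection = new HashSet <string>(grams1[k]);
                intersection.IntersectWith(grams2[k]);
                var union = new HashSet <string>(grams1[k]);
                union.UnionWith(grams2[k]);
                sum += union.Count == 0 ? 0.0 : (double)intersection.Count / union.Count;
            }
            return sum / n;
        }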