public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var sw = new StreamWriter(Configure.OutputPath);
    IndexWriter writer = null;
    if (Configure.IsFilterByWordCount)
    {
        writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
    }
    if (Configure.IsLoadFromFeatureVector)
    {
        Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
    }

    Console.WriteLine("Total: " + reader.NumDocs());
    int docIndex = 0;
    for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
    {
        if (iDoc % 10000 == 0)
        {
            Console.WriteLine(iDoc);
            sw.Flush();
        }
        string content = Configure.IsLoadFromFeatureVector
            ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector)
            : LuceneOperations.GetDocumentContent(reader.Document(iDoc), Configure.FieldWeightDict,
                Configure.LeadingSentenceCntDict);
        List<string> words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
        bool isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
        if (isPrintDoc)
        {
            if (Configure.IsFilterByWordCount)
            {
                writer.AddDocument(reader.Document(iDoc));
            }

            sw.Write(docIndex + " " + docIndex + " ");
            foreach (var word in words)
            {
                sw.Write(word + " ");
            }
            sw.Write("\n");
            docIndex++;
        }
    }

    if (Configure.IsFilterByWordCount)
    {
        writer.Optimize();
        writer.Close();
    }
    sw.Flush();
    sw.Close();
    reader.Close();
}
/// <summary>
/// Keep only the tweets whose creation time falls within a given time range
/// Output: *.filter.txt
/// </summary>
/// <param name="lucenePath">Lucene index folder path of tweets</param>
/// <param name="fileName">Input file path and prefix of output file</param>
/// <param name="minTimeStr">Lower bound of time range</param>
/// <param name="maxTimeStr">Upper bound of time range</param>
public static void filterTimeRange(string lucenePath, string fileName, string minTimeStr, string maxTimeStr)
{
    var indexReader = LuceneOperations.GetIndexReader(lucenePath);
    StreamReader sr = new StreamReader(fileName, Encoding.Default);
    FileStream fs = new FileStream(fileName + ".filter.txt", FileMode.Create);
    StreamWriter sw = new StreamWriter(fs, Encoding.Default);

    // Parse the range bounds once instead of on every input line
    DateTime minTime = DateTime.Parse(minTimeStr);
    DateTime maxTime = DateTime.Parse(maxTimeStr);

    string line;
    while ((line = sr.ReadLine()) != null)
    {
        int iDoc = int.Parse(line);
        Document inDoc = indexReader.Document(iDoc);
        string timeStr = inDoc.Get("CreatedAt");
        DateTime time = DateTime.Parse(timeStr);
        if (DateTime.Compare(time, minTime) > 0 && DateTime.Compare(time, maxTime) < 0)
        {
            sw.WriteLine(iDoc);
        }
    }
    sw.Close();
    fs.Close();
    sr.Close();
}
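A minimal call sketch for filterTimeRange. It assumes the doc-ID list is the signal.txt produced by match_ori; the index path and date strings are placeholders, not values from the source:

// Hypothetical call: keep only the signal tweets posted in October 2014.
// "signal.txt" holds one Lucene doc ID per line; kept IDs go to "signal.txt.filter.txt".
filterTimeRange(
    @"D:\DataProcess\TweetIndex\EbolaTwitter3\",   // placeholder index path
    "signal.txt",
    "2014-10-01 00:00:00",
    "2014-10-31 23:59:59");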
public void StartTransformTweetIndexForStreamingRoseRiver()
{
    string inputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01\";
    string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter3_Sample0.01_MOD\";

    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    var indexWriter = LuceneOperations.GetIndexWriter(outputPath);

    string docIDField = BingNewsFields.DocId;
    string urlField = BingNewsFields.DocumentURL;

    ProgramProgress progress = new ProgramProgress(indexReader.NumDocs());
    for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
    {
        Document inDoc = indexReader.Document(iDoc);
        // outDoc aliases inDoc: the ID and URL fields are rewritten in place before re-indexing
        Document outDoc = inDoc;

        outDoc.RemoveField(docIDField);
        outDoc.Add(new Field(docIDField, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

        outDoc.RemoveField(urlField);
        outDoc.Add(new Field(urlField, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED));

        indexWriter.AddDocument(outDoc);
        progress.PrintIncrementExperiment();
    }
    indexWriter.Optimize();
    indexWriter.Close();
    indexReader.Close();
}
public void Start()
{
    var writer = LuceneOperations.GetIndexWriter(OutputPath);

    var totalDocCnt = 0;
    foreach (var inputPath in InputPaths)
    {
        var reader = LuceneOperations.GetIndexReader(inputPath);
        totalDocCnt += reader.NumDocs();
        reader.Close();
    }

    var progress = new ProgramProgress(totalDocCnt);
    foreach (var inputPath in InputPaths)
    {
        var reader = LuceneOperations.GetIndexReader(inputPath);
        for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
        {
            writer.AddDocument(reader.Document(iDoc));
            progress.PrintIncrementExperiment();
        }
        reader.Close();
    }

    writer.Optimize();
    writer.Close();
}
public static void AnalyzeFieldValues(string inputPath, string fieldName, Func<string, string> convertValueFunc = null)
{
    if (convertValueFunc == null)
    {
        convertValueFunc = str => str;
    }

    string fileName = StringOperations.EnsureFolderEnd(inputPath) + fieldName + ".txt";
    StreamWriter sw = new StreamWriter(fileName);
    Counter<string> counter = new Counter<string>();

    var indexReader = LuceneOperations.GetIndexReader(inputPath);
    for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
    {
        var doc = indexReader.Document(iDoc);
        var value = doc.Get(fieldName);
        counter.Add(convertValueFunc(value));
    }

    foreach (var kvp in counter.GetCountDictionary().OrderBy(kvp => kvp.Key))
    {
        sw.WriteLine(kvp.Key + "\t\t" + kvp.Value);
        Console.WriteLine(kvp.Key + "\t\t" + kvp.Value);
    }
    sw.WriteLine("total: " + indexReader.NumDocs());

    sw.Flush();
    sw.Close();
    indexReader.Close();
    Console.ReadKey();
}
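A small usage sketch for AnalyzeFieldValues. The index path is a placeholder, and the converter lambda (bucketing "CreatedAt" values by day) is only an illustration of what convertValueFunc is for:

// Count how many documents fall on each calendar day of the "CreatedAt" field.
// Path, field name, and date format are assumptions for illustration only.
AnalyzeFieldValues(
    @"D:\DataProcess\TweetIndex\EbolaTwitter3\",
    "CreatedAt",
    str => str == null ? "(null)" : DateTime.Parse(str).ToString("yyyy-MM-dd"));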
/// <summary> /// Calculate the average published time of each tweet cluster /// Output: clusterAverageTime.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> public static void averageTime(string fileName) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); StreamReader sr1 = new StreamReader("generalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterAverageTime.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); string line; string line1; while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null) { line = sr.ReadLine(); line1 = sr1.ReadLine(); sr.ReadLine(); sr1.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } string[] iDocStrArray1 = Regex.Split(line1, " "); List <int> iDocList1 = new List <int>(); for (int i = 0; i < iDocStrArray1.Length - 1; i++) { iDocList1.Add(int.Parse(iDocStrArray1[i])); } int count = iDocList.Count + iDocList1.Count; double temp = 0.0; for (int i = 0; i < iDocList.Count; i++) { Document inDoc = indexReader.Document(iDocList[i]); string timeStr = inDoc.Get("CreatedAt"); DateTime time = DateTime.Parse(timeStr); temp += (double)time.Ticks / count; } for (int i = 0; i < iDocList1.Count; i++) { Document inDoc = indexReader.Document(iDocList1[i]); string timeStr = inDoc.Get("CreatedAt"); DateTime time = DateTime.Parse(timeStr); temp += (double)time.Ticks / count; } DateTime timeAvg = new DateTime((long)temp); sw.WriteLine(timeAvg.ToString()); } sw.Close(); fs.Close(); sr1.Close(); sr.Close(); }
public static void VisualizeTree(IEnumerable <string> brtFiles, string luceneIndex = null, string[] keywords = null, bool isRemoveLeafNodes = true) { List <ITree> trees = new List <ITree>(); foreach (var brtFile in brtFiles) { //Read tree from file TreeDataParser parser = new TreeDataParser(brtFile, isRemoveLeafNodes); var tree = parser.GetTree(); Trace.WriteLine(tree.GetDepth(tree.Root)); if (luceneIndex != null) { var scheme = TreeNodeScheme.Get(tree.Graph.NodeTable); scheme.SetIndexReader(LuceneOperations.GetIndexReader(luceneIndex)); scheme.SetBRTFileName(brtFile); } trees.Add(tree); } //Print analyze info DoubleStatistics depthStat = new DoubleStatistics(); DoubleStatistics internalNodeStat = new DoubleStatistics(); foreach (var tree in trees) { depthStat.AddNumber(tree.BFS(tree.Root).Max(node => { int depth = 0; INode ancestor = node; while ((ancestor = tree.GetParent(ancestor)) != null) { depth++; } return(depth); }) + 1); internalNodeStat.AddNumber(tree.BFS(tree.Root).Count()); } Console.WriteLine(depthStat.ToString()); Console.WriteLine(internalNodeStat.ToString()); //Visualize tree Thread NetServer = new Thread(new ThreadStart(() => { TreeVisualization treeVis = new TreeVisualization(trees, keywords); })); NetServer.SetApartmentState(ApartmentState.STA); NetServer.IsBackground = true; NetServer.Start(); System.Windows.Threading.Dispatcher.Run(); }
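A brief call sketch for VisualizeTree. The .brt file paths and keywords are placeholders; the Lucene index path is reused from the AnalyzeDocuments method elsewhere in this section purely as an example:

// Visualize two hierarchy files side by side, backed by a Lucene index so tree
// nodes can resolve their member documents. All arguments are illustrative.
VisualizeTree(
    new[] { @"D:\trees\day1.brt", @"D:\trees\day2.brt" },
    luceneIndex: @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1",
    keywords: new[] { "ebola", "vaccine" },
    isRemoveLeafNodes: true);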
public void Start() { string debugFileName = Configure.OutputPath + _debugFileName; if (File.Exists(debugFileName)) { File.Delete(debugFileName); } var reader = LuceneOperations.GetIndexReader(Configure.InputPath); List <int> docIDs = new List <int>(); for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++) { docIDs.Add(iDoc); } if (Configure.IsRemoveSameURL) { Console.WriteLine("=====================RemoveSameURL====================="); docIDs = RemoveSameURLDocument(reader, docIDs); } if (Configure.IsRemoveSimilarContent) { for (int iGranu = 0; iGranu < Configure.RemoveDateGranularity.Length; iGranu++) { int timeGranu = Configure.RemoveDateGranularity[iGranu]; int wordGranu = Configure.RemoveWordGranularity[iGranu]; Console.WriteLine("========Remove Similar Document: {0} out of {1}, Granu: {2} {3}========", iGranu, Configure.RemoveDateGranularity.Length, timeGranu, wordGranu); docIDs = RemoveSimilarDocumentsGranu(reader, docIDs, timeGranu, wordGranu); } } var writer = LuceneOperations.GetIndexWriter(Configure.OutputPath); foreach (var docID in docIDs) { writer.AddDocument(reader.Document(docID)); } writer.Optimize(); writer.Close(); reader.Close(); Console.WriteLine("All done"); //Console.ReadKey(); }
/// <summary> /// Extract the unigrams, bigrams and trigrams of signal tweets. /// Need executing method MatchSignal.match_ori() first. /// Preparing step for signal tweets clustering method cluster_ori(). /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> /// <param name="gramsList">List of unigrams, bigrams and trigrams of signal tweets</param> /// <param name="rec2iDoc">Dictionary from 3-grams record list # to tweet ID #</param> /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list #</param> public static void preCluster_ori(string fileName, List <List <HashSet <string> > > gramsList, Dictionary <int, int> rec2iDoc, Dictionary <int, int> iDoc2rec) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signal.txt", Encoding.Default); string line; int recNum = 0; while ((line = sr.ReadLine()) != null) { int iDoc = int.Parse(line); Document inDoc = indexReader.Document(iDoc); string text = inDoc.Get("Text").ToLower(); text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", ""); string[] gramArray = Regex.Split(text, " "); List <HashSet <string> > grams = new List <HashSet <string> >(); HashSet <string> unigram = new HashSet <string>(); for (int i = 0; i < gramArray.Length; i++) { unigram.Add(gramArray[i]); } grams.Add(unigram); HashSet <string> bigram = new HashSet <string>(); for (int i = 0; i < gramArray.Length - 1; i++) { bigram.Add(gramArray[i] + " " + gramArray[i + 1]); } grams.Add(bigram); HashSet <string> trigram = new HashSet <string>(); for (int i = 0; i < gramArray.Length - 2; i++) { trigram.Add(gramArray[i] + " " + gramArray[i + 1] + " " + gramArray[i + 2]); } grams.Add(trigram); if (recNum % 1000 == 0) { Console.WriteLine(recNum); } gramsList.Add(grams); rec2iDoc.Add(recNum, iDoc); iDoc2rec.Add(iDoc, recNum); recNum++; } sr.Close(); }
/// <summary>
/// Match rumor patterns to find signal tweets
/// Preparing step for method ClusterSignal.preCluster_ori()
/// Output: signal.txt
/// </summary>
/// <param name="fileName">Lucene index folder path of tweets</param>
public static void match_ori(string fileName)
{
    var indexReader = LuceneOperations.GetIndexReader(fileName);
    FileStream fs = new FileStream("signal.txt", FileMode.Create);
    StreamWriter sw = new StreamWriter(fs, Encoding.Default);

    for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
    {
        Document inDoc = indexReader.Document(iDoc);
        string text = inDoc.Get("Text").ToLower();

        if (Regex.IsMatch(text, @"is (this|that|it) true"))
        {
            sw.WriteLine(iDoc);
            continue;
        }
        if (Regex.IsMatch(text, @"(^|[^A-Za-z] )wh(a*)t([\?!]+)"))
        {
            sw.WriteLine(iDoc);
            continue;
        }
        if (Regex.IsMatch(text, @"(real\?|really\?|unconfirmed)"))
        {
            sw.WriteLine(iDoc);
            continue;
        }
        if (Regex.IsMatch(text, @"(rumor|debunk)"))
        {
            sw.WriteLine(iDoc);
            continue;
        }
        if (Regex.IsMatch(text, @"(that|this|it) is not true"))
        {
            sw.WriteLine(iDoc);
            continue;
        }

        if (iDoc % 100000 == 0)
        {
            Console.WriteLine(iDoc);
        }
    }
    sw.Close();
    fs.Close();
}
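The doc comments in this section describe a multi-step rumor-detection pipeline. A hedged driver sketch of the assumed call order follows; the class names come from the doc comments where they are mentioned (MatchSignal, ClusterSignal, ClusterGeneral) and are guessed otherwise, and all paths and variable names are placeholders:

// Assumed end-to-end order, reconstructed from the "Need executing ... first"
// and "Preparing step for ..." notes in the doc comments. Not a verbatim driver.
string index = @"D:\DataProcess\TweetIndex\EbolaTwitter3\";   // placeholder path

// 1. Find signal tweets that question or debunk a claim -> signal.txt
MatchSignal.match_ori(index);

// 2. Build unigram/bigram/trigram sets for each signal tweet
var gramsList = new List<List<HashSet<string>>>();
var rec2iDoc = new Dictionary<int, int>();
var iDoc2rec = new Dictionary<int, int>();
ClusterSignal.preCluster_ori(index, gramsList, rec2iDoc, iDoc2rec);

// 3. Cluster the signal tweets -> signalCluster.txt. That step is not shown in
//    this section; it is assumed to also yield one gram-set representation per
//    signal cluster, used below as clusterGrams.
var clusterGrams = new List<List<HashSet<string>>>();   // filled by the signal-clustering step

// 4. Attach all remaining tweets to the signal clusters -> generalCluster.txt
var gList = new List<List<int>>();
ClusterGeneral.cluster_ori(index, iDoc2rec, clusterGrams, gList);

// 5. Pick one representative tweet per cluster (clusterRep*.txt), then rank the
//    clusters as rumor candidates with rank_naive (rankCluster.txt).
ClusterSignal.selectRepresentative(index, gramsList, iDoc2rec);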
public void AnalyzeDocuments() { string fileName = @"D:\Project\TopicPanorama\data\TopicGraphs\NewCode-Ebola-Test2\Raw\news\result\lda.top.json"; string indexPath = @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1"; int topDocCnt = 20; var indexReader = LuceneOperations.GetIndexReader(indexPath); //Read from json and sort SimpleJsonReader reader = new SimpleJsonReader(new StreamReader(File.Open(fileName, FileMode.Open))); HeapSortDouble[] hsd = null; int topicNumber = -1; ProgramProgress progress = new ProgramProgress(indexReader.NumDocs()); while (reader.IsReadable) { int docID = int.Parse(reader.ReadPropertyName()); double[] topicArray = reader.ReadDoubleArray(); if (topicNumber < 0) { topicNumber = topicArray.Length; hsd = new HeapSortDouble[topicNumber]; for (int i = 0; i < topicNumber; i++) { hsd[i] = new HeapSortDouble(topDocCnt); } } for (int i = 0; i < topicNumber; i++) { hsd[i].Insert(docID, topicArray[i]); } progress.PrintIncrementExperiment(); } progress.PrintTotalTime(); //Statistics Console.ReadLine(); }
/// <summary>
/// Output representative tweet text of each tweet cluster
/// Need executing selectRepresentative() first
/// Output: clusterRepOriginalText.txt
/// </summary>
/// <param name="fileName">Lucene index folder path of tweets</param>
public static void outputRepresentativeOriginalText(string fileName)
{
    var indexReader = LuceneOperations.GetIndexReader(fileName);
    StreamReader sr = new StreamReader("clusterRepIDoc.txt", Encoding.Default);
    FileStream fs = new FileStream("clusterRepOriginalText.txt", FileMode.Create);
    StreamWriter sw = new StreamWriter(fs, Encoding.Default);

    string line;
    while ((line = sr.ReadLine()) != null)
    {
        Document inDoc = indexReader.Document(int.Parse(line));
        string text = inDoc.Get("Text");
        text = Regex.Replace(text, @"#N#", "");
        text = Regex.Replace(text, @"#n#", "");
        text = Regex.Replace(text, @"\s+", " ");
        sw.WriteLine(text);
    }
    sw.Close();
    fs.Close();
    sr.Close();
}
public void Start()
{
    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    var docNum = reader.NumDocs();
    ProgramProgress progress = new ProgramProgress(docNum);

    XmlDoc[] xmlDocs = new XmlDoc[docNum];
    for (int iDoc = 0; iDoc < docNum; iDoc++)
    {
        var doc = reader.Document(iDoc);
        xmlDocs[iDoc] = new XmlDoc(doc);
        progress.PrintIncrementExperiment();
    }
    progress.PrintTotalTime();

    // Serialize the collection to the output file
    XmlSerializer serializer = new XmlSerializer(typeof(XmlDocCollection));
    using (var writer = new StreamWriter(Configure.OutputPath))
    {
        serializer.Serialize(writer, new XmlDocCollection() { XmlDocs = xmlDocs });
    }
}
/// <summary>
/// Output name entity set of each tweet cluster
/// Output: clusterNameEntitySet.txt
/// </summary>
/// <param name="fileName">Lucene index folder path of tweets</param>
public static void nameEntitySet(string fileName)
{
    var indexReader = LuceneOperations.GetIndexReader(fileName);
    StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default);
    StreamReader sr1 = new StreamReader("generalCluster.txt", Encoding.Default);
    FileStream fs = new FileStream("clusterNameEntitySet.txt", FileMode.Create);
    StreamWriter sw = new StreamWriter(fs, Encoding.Default);

    // Path to the folder with classifiers models
    var jarRoot = @"..\..\..\..\stanford-ner-2015-12-09";
    var classifiersDirectory = jarRoot + @"\classifiers";

    // Loading 3 class classifier model
    var classifier = CRFClassifier.getClassifierNoExceptions(
        classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

    // Extracts the inner text of every <TAG>...</TAG> span produced by the NER
    // tagger and adds it to the entity set.
    Action<HashSet<string>, string, string> addEntities = (entitySet, taggedText, tag) =>
    {
        foreach (Match m in Regex.Matches(taggedText, "<" + tag + ">[^<>]+</" + tag + ">"))
        {
            string str = m.ToString();
            entitySet.Add(str.Substring(tag.Length + 2, str.Length - (2 * tag.Length + 5)));
        }
    };

    string line;
    string line1;
    while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null)
    {
        // Each cluster record spans three lines: header, doc-ID list, blank line
        line = sr.ReadLine();
        line1 = sr1.ReadLine();
        sr.ReadLine();
        sr1.ReadLine();

        string[] iDocStrArray = Regex.Split(line, " ");
        List<int> iDocList = new List<int>();
        for (int i = 0; i < iDocStrArray.Length - 1; i++)
        {
            iDocList.Add(int.Parse(iDocStrArray[i]));
        }

        string[] iDocStrArray1 = Regex.Split(line1, " ");
        List<int> iDocList1 = new List<int>();
        for (int i = 0; i < iDocStrArray1.Length - 1; i++)
        {
            iDocList1.Add(int.Parse(iDocStrArray1[i]));
        }

        // Collect PERSON / ORGANIZATION / LOCATION entities from both the signal
        // tweets and the general tweets of this cluster
        HashSet<string> nameEntitySet = new HashSet<string>();
        foreach (int iDoc in iDocList.Concat(iDocList1))
        {
            Document inDoc = indexReader.Document(iDoc);
            string text = inDoc.Get("Text");
            text = Regex.Replace(text, @"\s+", " ");
            text = Regex.Replace(text, @"#n#|#N#", "");
            text = Regex.Replace(text, @"#", "");
            text = Regex.Replace(text, @"@", "");
            text = classifier.classifyWithInlineXML(text);

            addEntities(nameEntitySet, text, "PERSON");
            addEntities(nameEntitySet, text, "ORGANIZATION");
            addEntities(nameEntitySet, text, "LOCATION");
        }

        foreach (var entity in nameEntitySet)
        {
            sw.Write(entity + "; ");
        }
        sw.WriteLine();
    }
    sw.Close();
    fs.Close();
    sr1.Close();
    sr.Close();
}
public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig) { var indexReader = LuceneOperations.GetIndexReader(inputPath); var docNum = indexReader.NumDocs(); int[] docWordCnt = new int[docNum]; int[] docUniqWordCnt = new int[docNum]; Dictionary <string, int> wordDocCntDict = new Dictionary <string, int>(); Dictionary <string, int> wordOccCntDict = new Dictionary <string, int>(); var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector ? BingNewsFields.FeatureVectorFieldWeights : BingNewsFields.NewsFieldWeights; ProgramProgress progress = new ProgramProgress(docNum); for (int iDoc = 0; iDoc < docNum; iDoc++) { var document = indexReader.Document(iDoc); var content = LuceneOperations.GetContent(document, fieldWeights); var words = NLPOperations.Tokenize(content, tokenConfig); var uniqueWords = new HashSet <string>(words); docWordCnt[iDoc] = words.Count; docUniqWordCnt[iDoc] = uniqueWords.Count; foreach (var word in uniqueWords) { if (!wordDocCntDict.ContainsKey(word)) { wordDocCntDict.Add(word, 0); } wordDocCntDict[word]++; } foreach (var word in words) { if (!wordOccCntDict.ContainsKey(word)) { wordOccCntDict.Add(word, 0); } wordOccCntDict[word]++; } progress.PrintIncrementExperiment(); } progress.PrintTotalTime(); indexReader.Close(); //Statistics DoubleStatistics statDocWordCnt = new DoubleStatistics(); DoubleStatistics statDocUniqWordCnt = new DoubleStatistics(); DoubleStatistics statWordDocCnt = new DoubleStatistics(); DoubleStatistics statWordOccCnt = new DoubleStatistics(); for (int iDoc = 0; iDoc < docNum; iDoc++) { statDocWordCnt.AddNumber(docWordCnt[iDoc]); statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]); } foreach (var kvp in wordDocCntDict) { statWordDocCnt.AddNumber(kvp.Value); } foreach (var kvp in wordOccCntDict) { statWordOccCnt.AddNumber(kvp.Value); } Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt")); Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt")); Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt")); Console.WriteLine(statWordOccCnt.ToString("wordOccCnt")); //Hist var docWordCntHist = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1); var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1); var wordDocCntHist = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000); var wordDocCntHist2 = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1); docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv"); docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv"); wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv"); wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv"); Console.Read(); }
public void Start() { Initialize(); var reader = LuceneOperations.GetIndexReader(Configure.InputPath); InitializeWriters(); var docNum = reader.NumDocs(); var progress = new ProgramProgress(docNum); for (int iDoc = 0; iDoc < docNum; iDoc++) { var doc = reader.Document(iDoc); bool isSkip = false; //random sample if (!isSkip && Configure.IsSampling) { if (Random.NextDouble() > Configure.SampleRatio) { isSkip = true; } } //filter by time if (!isSkip && Configure.IsSelectByTime) { var dateTime = StringOperations.ParseDateTimeString( doc.Get(Configure.TimeField), Configure.ParseTimeFormat); if (dateTime.Subtract(StartDateTime).Ticks < 0 || dateTime.Subtract(EndDateTime).Ticks > 0) { isSkip = true; } } //filter by exact match if (!isSkip && Configure.IsSelectByExactMatch) { foreach (var kvp in Configure.FieldMatchDict) { if (doc.Get(kvp.Key) != kvp.Value) { isSkip = true; break; } } } if (!isSkip) { GetWriter(doc).AddDocument(doc); } progress.PrintIncrementExperiment(); } CloseWriters(); reader.Close(); }
public void Start() { string inputPath = @"D:\DataProcess\TweetIndex\tweets-Ebola-20150101-20150228_dedup\"; string outputPath = @"D:\DataProcess\TweetIndex\EbolaTwitter2\"; var indexReader = LuceneOperations.GetIndexReader(inputPath); var indexWriter = LuceneOperations.GetIndexWriter(outputPath); char[] seperator = new char[] { ' ' }; string[] aidFields = new string[] { "User_FollowersCount", "User_Name", "User_ScreenName", "Retweet", "Mention" }; ProgramProgress progress = new ProgramProgress(indexReader.NumDocs()); //for (int iDoc = 0; iDoc < 1000; iDoc++) for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++) { Document inDoc = indexReader.Document(iDoc); Document outDoc = new Document(); string inTime = inDoc.Get("CreateAt"); DateTime dateTime = DateTime.Parse(inTime); outDoc.Add(new Field(BingNewsFields.DiscoveryStringTime, dateTime.ToString(BingNewsFields.TimeFormat), Field.Store.YES, Field.Index.ANALYZED)); string hashtag = inDoc.Get("Hashtag"); string word = inDoc.Get("Word"); if (hashtag == null) { hashtag = ""; } var hashtagTokens = hashtag.Split(seperator, StringSplitOptions.RemoveEmptyEntries); var wordTokens = word.Split(seperator, StringSplitOptions.RemoveEmptyEntries); string title = hashtagTokens.Length > 0 ? hashtagTokens[0] : wordTokens.Length > 0 ? wordTokens[0] : ""; outDoc.Add(new Field(BingNewsFields.NewsArticleHeadline, title, Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.NewsArticleDescription, inDoc.Get("Text"), Field.Store.YES, Field.Index.ANALYZED)); string featureVector = ""; Counter <string> counter = new Counter <string>(); foreach (var tag in hashtagTokens) { counter.Add(tag); counter.Add(tag); } foreach (var w in wordTokens) { counter.Add(w); } foreach (var kvp in counter.GetSortedCountDictioanry()) { featureVector += string.Format("{0}({1})\\n", kvp.Key, kvp.Value); } outDoc.Add(new Field(BingNewsFields.FeatureVector, featureVector, Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.DocId, iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED)); outDoc.Add(new Field(BingNewsFields.DocumentURL, "http://" + iDoc.ToString(), Field.Store.YES, Field.Index.ANALYZED)); foreach (var aidField in aidFields) { var value = inDoc.Get(aidField); outDoc.Add(new Field(aidField, value == null ? "" : value, Field.Store.YES, Field.Index.ANALYZED)); } indexWriter.AddDocument(outDoc); progress.PrintIncrementExperiment(); } indexWriter.Optimize(); indexWriter.Close(); indexReader.Close(); }
public void Start() { if (!Configure.InputPath.EndsWith("\\")) { Configure.InputPath += "\\"; } var reader = LuceneOperations.GetIndexReader(Configure.InputPath); var docNum = reader.NumDocs(); var docNumPart = docNum / 100; Console.WriteLine("Total: " + docNum); Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed); //Topwords var counter = new Counter <string>(); for (int iDoc = 0; iDoc < docNum; iDoc++) { if (iDoc % docNumPart == 0) { Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%"); } if (random.NextDouble() > Configure.SampleRatio) { continue; } var doc = reader.Document(iDoc); var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict); var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig); foreach (var word in words) { counter.Add(word); } } var topwords = counter.GetMostFreqObjs(Configure.TopWordCount); var wordCounterDict = counter.GetCountDictionary(); var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt"); foreach (var topword in topwords) { swTopWords.WriteLine(topword); } swTopWords.Flush(); swTopWords.Close(); //CoOccurrence if (Configure.IsPrintCooccurrence) { var k = topwords.Count; var occurCounterDict = new Dictionary <string, Counter <string> >(); foreach (var topword in topwords) { occurCounterDict.Add(topword, new Counter <string>()); } for (int iDoc = 0; iDoc < docNum; iDoc++) { if (iDoc % docNumPart == 0) { Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%"); } if (random.NextDouble() > Configure.SampleRatio) { continue; } var doc = reader.Document(iDoc); var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict); var words = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig)); foreach (var word in words) { if (occurCounterDict.ContainsKey(word)) { var occurCounter = occurCounterDict[word]; foreach (var word2 in words) { if (word2 == word) { continue; } if (occurCounterDict.ContainsKey(word2)) { occurCounter.Add(word2); } } } } } var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount); var pairDict = new Dictionary <int, Tuple <string, string> >(); var iPair = 0; foreach (var kvp in occurCounterDict) { var word = kvp.Key; var occurCounter = kvp.Value; foreach (var kvp2 in occurCounter.GetCountDictionary()) { heapSort.Insert(iPair, kvp2.Value); pairDict.Add(iPair, new Tuple <string, string>(word, kvp2.Key)); iPair++; } } var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt"); foreach (var kvp in heapSort.GetSortedDictionary()) { var pair = pairDict[kvp.Key]; swCoOccurrence.WriteLine("{0} - {1}\t{2}", pair.Item1, pair.Item2, kvp.Value); } swCoOccurrence.Flush(); swCoOccurrence.Close(); } reader.Close(); }
/// <summary> /// Rank general clusters with naive algorithm to find the most likely rumors /// Output: rankCluster.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> /// <param name="rList">List of tweet ID # list of signal tweets in each tweet cluster</param> /// <param name="gList">List of tweet ID # list of non-signal tweets in each tweet cluster</param> public static void rank_naive(string fileName, List <List <int> > rList, List <List <int> > gList) { StreamReader sr = new StreamReader("generalCluster.txt", Encoding.Default); string line; while ((line = sr.ReadLine()) != null) { line = sr.ReadLine(); sr.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); if (iDocStrArray == null) { gList.Add(iDocList); continue; } for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } gList.Add(iDocList); } sr.Close(); List <ScoreRec> scoreList = new List <ScoreRec>(); var indexReader = LuceneOperations.GetIndexReader(fileName); MatchCollection mc; int count; for (int i = 0; i < gList.Count; i++) { if (i % 10 == 0) { Console.WriteLine(i); } double score = 0.0; double count_popularity = 0.2 * Math.Log10((double)(rList[i].Count + gList[i].Count)); double count_signal = 0.3 * (double)rList[i].Count / (double)(rList[i].Count + gList[i].Count); double count_url = 0.0; double count_mention = 0.0; double count_length = 0.0; for (int j = 0; j < rList[i].Count; j++) { int iDoc = rList[i][j]; Document inDoc = indexReader.Document(iDoc); string text = inDoc.Get("Text").ToLower(); mc = Regex.Matches(text, @"http:"); count = mc.Count; if (count > 2) { count = 2; } count_url += count; mc = Regex.Matches(text, @"@"); count = mc.Count; if (count > 5) { count = 5; } count_mention += count; text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", ""); string[] gramArray = Regex.Split(text, " "); count_length += gramArray.Length; } for (int j = 0; j < gList[i].Count; j++) { int iDoc = gList[i][j]; Document inDoc = indexReader.Document(iDoc); string text = inDoc.Get("Text").ToLower(); mc = Regex.Matches(text, @"http:"); count = mc.Count; if (count > 2) { count = 2; } count_url += count; mc = Regex.Matches(text, @"@"); count = mc.Count; if (count > 5) { count = 5; } count_mention += count; text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", ""); string[] gramArray = Regex.Split(text, " "); count_length += gramArray.Length; } count_url /= (double)(rList[i].Count + gList[i].Count); count_mention /= (double)(rList[i].Count + gList[i].Count); count_length /= (double)(rList[i].Count + gList[i].Count); count_url = (2 - count_url) * 0.1; count_mention = (5 - count_mention) * 0.05; count_length = (140 / count_length > 10 ? 10 : 140 / count_length) * 0.02; score = count_popularity + count_signal + count_url + count_mention + count_length; scoreList.Add(new ScoreRec(score, i)); } scoreList.Sort(new ScoreRecComparer()); FileStream fs = new FileStream("rankCluster.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); for (int i = 0; i < gList.Count; i++) { Console.WriteLine(i + ": " + scoreList[i].score + " " + scoreList[i].rec); sw.WriteLine(i + ": " + scoreList[i].score + " " + scoreList[i].rec); } sw.Close(); fs.Close(); }
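The score in rank_naive is a fixed-weight sum of five cues. A worked example with assumed numbers (not from the source), only to show how the weights trade off:

// Assumed cluster: 40 signal + 60 general tweets, averaging 0.5 URLs,
// 1 mention, and 20 words per tweet. These numbers are illustrative only.
double popularity  = 0.2 * Math.Log10(100);             // 0.40  (cluster size)
double signalRatio = 0.3 * (40.0 / 100.0);               // 0.12  (share of signal tweets)
double urlTerm     = (2 - 0.5) * 0.1;                     // 0.15  (fewer URLs -> higher score)
double mentionTerm = (5 - 1.0) * 0.05;                    // 0.20  (fewer mentions -> higher score)
double lengthTerm  = Math.Min(140.0 / 20.0, 10) * 0.02;   // 0.14  (shorter tweets -> higher score)
double score = popularity + signalRatio + urlTerm + mentionTerm + lengthTerm;   // ≈ 1.01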
/// <summary> /// Calculate mention similarity matrix of tweet clusters /// </summary> public static void mentionSimilarity(string fileName) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); StreamReader sr1 = new StreamReader("generalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterMentionSimilarity.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); var mentionList = new List <HashSet <string> >(); string line; while ((line = sr.ReadLine()) != null) { line = sr.ReadLine(); sr.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } sr1.ReadLine(); line = sr1.ReadLine(); sr1.ReadLine(); iDocStrArray = Regex.Split(line, " "); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } var mention = new HashSet <string>(); for (int i = 0; i < iDocList.Count; i++) { Document inDoc = indexReader.Document(iDocList[i]); string userSrnName = inDoc.Get("UserScreenName"); mention.Add(userSrnName); string text = inDoc.Get("Text"); MatchCollection mc; mc = Regex.Matches(text, @"@[A-Za-z0-9_]+"); var it = mc.GetEnumerator(); for (int j = 0; j < mc.Count; j++) { it.MoveNext(); string str = it.Current.ToString(); mention.Add(str.Substring(1)); } } mentionList.Add(mention); } for (int i = 0; i < mentionList.Count; i++) { var mention1 = mentionList[i]; for (int j = 0; j < mentionList.Count; j++) { var mention2 = mentionList[j]; int sim = 0; foreach (var name in mention1) { if (mention2.Contains(name)) { sim = 1; break; } } sw.Write(sim + " "); } sw.WriteLine(); } sw.Close(); fs.Close(); sr1.Close(); sr.Close(); }
/// <summary> /// Select a representative tweet for each tweet cluster /// Output: clusterRepIDoc.txt, clusterRepText.txt, clusterRepWords.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> /// <param name="gramsList">List of 3-grams sets of signal tweets in each signal tweet cluster</param> /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list #</param> public static void selectRepresentative(string fileName, List <List <HashSet <string> > > gramsList, Dictionary <int, int> iDoc2rec) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterRepIDoc.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); FileStream fs1 = new FileStream("clusterRepText.txt", FileMode.Create); StreamWriter sw1 = new StreamWriter(fs1, Encoding.Default); FileStream fs2 = new FileStream("clusterRepWords.txt", FileMode.Create); StreamWriter sw2 = new StreamWriter(fs2, Encoding.Default); string line; while ((line = sr.ReadLine()) != null) { line = sr.ReadLine(); sr.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } double[] simArr = new double[iDocList.Count]; for (int i = 0; i < iDocList.Count; i++) { simArr[i] = 0.0; } for (int i = 0; i < iDocList.Count; i++) { int rec1 = iDoc2rec[iDocList[i]]; for (int j = i + 1; j < iDocList.Count; j++) { int rec2 = iDoc2rec[iDocList[j]]; double sim = ClusterGeneral.jaccard(gramsList[rec1], gramsList[rec2]); simArr[i] += sim; simArr[j] += sim; } } if (iDocList.Count > 1) { for (int i = 0; i < iDocList.Count; i++) { simArr[i] /= (iDocList.Count - 1); } } double maxSim = -1.0; int maxSimIndex = -1; for (int i = 0; i < iDocList.Count; i++) { if (simArr[i] > maxSim) { maxSim = simArr[i]; maxSimIndex = i; } } int iDoc = iDocList[maxSimIndex]; Document inDoc = indexReader.Document(iDoc); string text = inDoc.Get("Text").ToLower(); text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"#n#", ""); string words = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", ""); sw.WriteLine(iDoc); sw1.WriteLine(text); sw2.WriteLine(words); } sw2.Close(); fs2.Close(); sw1.Close(); fs1.Close(); sw.Close(); fs.Close(); sr.Close(); }
public static void Test() { string indexPath = @"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\WeiboSortByHotIndex_Time_RemoveNoise2_RemoveSimilar2"; var reader = LuceneOperations.GetIndexReader(indexPath); //var keywords = new string[]{"街","信","死","女","清","刷","骂","愿","爱","查","舰","版","通","岁","撕"}; //foreach (var keyword in keywords) { var sw = new StreamWriter(@"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\TestTokenizer" + "Stat" + ".txt", false, Encoding.UTF8); //ChineseWordBreaker chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\"); int cnt1 = 0, cnt2 = 0; int cnt1all = 0, cnt2all = 0; for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++) { string sentence = reader.Document(iDoc).Get("NewsArticleDescription"); var words1 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ICTCLAS, StopWordsFile.CH)); var words2 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ChineseWordBreaker, StopWordsFile.CH)); //bool isPrint = false; //foreach (var word in words1) // if (word.Length == 1) // { // isPrint = true; // cnt1++; // } //foreach (var word in words2) // if (word.Length == 2) // { // isPrint = true; // cnt2++; // } cnt1all += words1.Count; cnt2all += words2.Count; //if (isPrint) //{ // sw.WriteLine("-------------{0}-------------", iDoc); // sw.WriteLine(sentence); // sw.WriteLine("[ICT]\t" + StringOperations.GetMergedString(words1)); // sw.WriteLine("[CWB]\t" + StringOperations.GetMergedString(words2)); // sw.WriteLine("[ICT--]\t" + Marshal.PtrToStringAnsi(NLPIR_ParagraphProcess(sentence, 1))); // //sw.WriteLine("[CWB--]\t" + chineseWordBreaker.GetResult(sentence)); // sw.WriteLine(); // sw.Flush(); //} } sw.WriteLine("cnt1 = " + cnt1); sw.WriteLine("cnt2 = " + cnt2); sw.WriteLine("cnt1all = " + cnt1all); sw.WriteLine("cnt2all = " + cnt2all); sw.Flush(); sw.Close(); } }
/// <summary> /// Output hashtag set of each tweet cluster /// Output: clusterHashtagSet.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> public static void hashtagSet(string fileName) { var indexReader = LuceneOperations.GetIndexReader(fileName); StreamReader sr = new StreamReader("signalCluster.txt", Encoding.Default); StreamReader sr1 = new StreamReader("generalCluster.txt", Encoding.Default); FileStream fs = new FileStream("clusterHashtagSet.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); string line; string line1; while ((line = sr.ReadLine()) != null && (line1 = sr1.ReadLine()) != null) { line = sr.ReadLine(); line1 = sr1.ReadLine(); sr.ReadLine(); sr1.ReadLine(); string[] iDocStrArray = Regex.Split(line, " "); List <int> iDocList = new List <int>(); for (int i = 0; i < iDocStrArray.Length - 1; i++) { iDocList.Add(int.Parse(iDocStrArray[i])); } string[] iDocStrArray1 = Regex.Split(line1, " "); List <int> iDocList1 = new List <int>(); for (int i = 0; i < iDocStrArray1.Length - 1; i++) { iDocList1.Add(int.Parse(iDocStrArray1[i])); } HashSet <string> hashtagSet = new HashSet <string>(); for (int i = 0; i < iDocList.Count; i++) { Document inDoc = indexReader.Document(iDocList[i]); string text = inDoc.Get("Text").ToLower(); text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"#n#", ""); MatchCollection mc; mc = Regex.Matches(text, @"#[A-Za-z0-9_]+"); var it = mc.GetEnumerator(); for (int j = 0; j < mc.Count; j++) { it.MoveNext(); hashtagSet.Add(it.Current.ToString()); } } for (int i = 0; i < iDocList1.Count; i++) { Document inDoc = indexReader.Document(iDocList1[i]); string text = inDoc.Get("Text").ToLower(); text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"#n#", ""); MatchCollection mc; mc = Regex.Matches(text, @"#[A-Za-z0-9_]+"); var it = mc.GetEnumerator(); for (int j = 0; j < mc.Count; j++) { it.MoveNext(); hashtagSet.Add(it.Current.ToString()); } } var iter = hashtagSet.GetEnumerator(); for (int i = 0; i < hashtagSet.Count; i++) { iter.MoveNext(); if (iter.Current != "#ebola") { sw.Write(iter.Current.ToString() + " "); } } sw.WriteLine(); } sw.Close(); fs.Close(); sr1.Close(); sr.Close(); }
/// <summary> /// Cluster all the tweets with the representation (3-grams that often appear) of each signal tweet cluster. /// Actually, for each non-signal tweet, we compare its 3-grams set with representation of /// each signal tweet cluster to decide which cluster the non-signal tweet will be added into. /// Output: generalCluster.txt /// </summary> /// <param name="fileName">Lucene index folder path of tweets</param> /// <param name="iDoc2rec">Dictionary from tweet ID # to 3-grams record list # of signal tweets</param> /// <param name="gramsClList">List of unigrams, bigrams and trigrams of signal tweets</param> /// <param name="gList">List of tweet ID # list of general tweets (non-signal tweets) in each tweet cluster</param> /// <param name="minTimeStr">Time stamp string of the earliest general tweets</param> /// <param name="maxTimeStr">Time stamp string of the latest general tweets</param> public static void cluster_ori(string fileName, Dictionary <int, int> iDoc2rec, List <List <HashSet <string> > > gramsClList, List <List <int> > gList, string minTimeStr = null, string maxTimeStr = null) { double jaccard_threshold = 0.6; var indexReader = LuceneOperations.GetIndexReader(fileName); int signalClusterCount = gramsClList.Count; for (int i = 0; i < signalClusterCount; i++) { gList.Add(new List <int>()); } for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++) { if (iDoc % 100 == 0) { Console.WriteLine(iDoc); } if (iDoc2rec.ContainsKey(iDoc)) { continue; } Document inDoc = indexReader.Document(iDoc); if (minTimeStr != null && maxTimeStr != null) { string timeStr = inDoc.Get("CreatedAt"); DateTime time = DateTime.Parse(timeStr); DateTime minTime = DateTime.Parse(minTimeStr); DateTime maxTime = DateTime.Parse(maxTimeStr); if (DateTime.Compare(time, minTime) <= 0 || DateTime.Compare(time, maxTime) >= 0) { continue; } } string text = inDoc.Get("Text").ToLower(); text = Regex.Replace(text, @"\s+", " "); text = Regex.Replace(text, @"[^A-Za-z0-9_ ]+", ""); string[] gramArray = Regex.Split(text, " "); List <HashSet <string> > grams = new List <HashSet <string> >(); HashSet <string> unigram = new HashSet <string>(); for (int i = 0; i < gramArray.Length; i++) { unigram.Add(gramArray[i]); } grams.Add(unigram); HashSet <string> bigram = new HashSet <string>(); for (int i = 0; i < gramArray.Length - 1; i++) { bigram.Add(gramArray[i] + " " + gramArray[i + 1]); } grams.Add(bigram); HashSet <string> trigram = new HashSet <string>(); for (int i = 0; i < gramArray.Length - 2; i++) { trigram.Add(gramArray[i] + " " + gramArray[i + 1] + " " + gramArray[i + 2]); } grams.Add(trigram); for (int i = 0; i < signalClusterCount; i++) { if (jaccard(grams, gramsClList[i]) > jaccard_threshold) { gList[i].Add(iDoc); } } } FileStream fs = new FileStream("generalCluster.txt", FileMode.Create); StreamWriter sw = new StreamWriter(fs, Encoding.Default); int count = 0; for (int i = 0; i < gList.Count; i++) { count += gList[i].Count; sw.WriteLine(i + " " + gList[i].Count + " " + count); for (int j = 0; j < gList[i].Count; j++) { sw.Write(gList[i][j] + " "); } sw.WriteLine(); sw.WriteLine(); } sw.Close(); fs.Close(); }
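Both cluster_ori and selectRepresentative call ClusterGeneral.jaccard on two List&lt;HashSet&lt;string&gt;&gt; gram structures, but that helper is not shown in this section. A minimal sketch of what it is assumed to compute, namely Jaccard similarity pooled over the unigram, bigram, and trigram sets; the project's real helper may weight the gram orders differently:

// Assumed shape of ClusterGeneral.jaccard: |intersection| / |union|, summed
// over the three gram levels (unigrams, bigrams, trigrams). Sketch only.
public static double jaccard(List<HashSet<string>> gramsA, List<HashSet<string>> gramsB)
{
    int intersection = 0, union = 0;
    for (int i = 0; i < gramsA.Count && i < gramsB.Count; i++)
    {
        int inter = gramsA[i].Count(g => gramsB[i].Contains(g));
        intersection += inter;
        union += gramsA[i].Count + gramsB[i].Count - inter;
    }
    return union == 0 ? 0.0 : (double)intersection / union;
}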